1use std::collections::HashMap;
2use std::sync::Arc;
3
4use algocline_core::pkg::PkgType;
5
6use super::eval_store::{
7 escape_for_lua_sq, evals_dir, extract_strategy_from_id, list_eval_history, save_compare_result,
8 save_eval_result, splice_response_string,
9};
10use super::path::ContainedPath;
11use super::resolve::{is_package_installed, resolve_scenario_code};
12use super::run::normalize_stringified_json_object;
13use super::AppService;
14
/// Lua prelude prepended to every chunk this module generates.
///
/// It defines a global `std` table with three sub-tables:
/// - `std.json.decode` / `std.json.encode`, aliased to `alc.json_decode` /
///   `alc.json_encode`;
/// - `std.fs.read(path)`, which returns the whole file content or raises a Lua
///   error prefixed with `std.fs.read: `, and `std.fs.is_file(path)`, which
///   reports whether `io.open(path, "r")` succeeds;
/// - `std.time.now`, aliased to `alc.time`.
///
/// NOTE(review): presumably this exists so the `evalframe` package can call a
/// `std.*` API inside the `alc` runtime — confirm against the package.
const STD_SHIM: &str = r#"
std = {
    json = {
        decode = alc.json_decode,
        encode = alc.json_encode,
    },
    fs = {
        read = function(path)
            local f, err = io.open(path, "r")
            if not f then error("std.fs.read: " .. (err or path), 2) end
            local content = f:read("*a")
            f:close()
            return content
        end,
        is_file = function(path)
            local f = io.open(path, "r")
            if f then f:close(); return true end
            return false
        end,
    },
    time = {
        now = alc.time,
    },
}
"#;
42
43impl AppService {
44 pub async fn eval(
58 &self,
59 scenario: Option<String>,
60 scenario_file: Option<String>,
61 scenario_name: Option<String>,
62 strategy: &str,
63 strategy_opts: Option<serde_json::Value>,
64 auto_card: bool,
65 ) -> Result<String, String> {
66 let app_dir = self.log_config.app_dir();
68 if !is_package_installed(&app_dir, "evalframe") {
69 self.auto_install_bundled_packages().await?;
70 if !is_package_installed(&app_dir, "evalframe") {
71 return Err(
72 "Package 'evalframe' not found after installing bundled collection. \
73 Use alc_pkg_install to install it manually."
74 .into(),
75 );
76 }
77 }
78
79 if let Some((PkgType::Library, _)) = self.resolve_pkg_type_lua(strategy).await? {
81 return Err(format!(
82 "Package '{strategy}' is a library package (type = \"library\"). \
83 Library packages cannot be evaluated as strategies. \
84 Use a runnable package instead."
85 ));
86 }
87
88 let scenario_code =
89 resolve_scenario_code(&app_dir, scenario, scenario_file, scenario_name.clone())?;
90
91 let strategy_opts = strategy_opts.map(normalize_stringified_json_object);
93 let opts_lua = match &strategy_opts {
94 Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
95 _ => "nil".to_string(),
96 };
97
98 let auto_card_lua = if auto_card { "true" } else { "false" };
99
100 let wrapped = format!(
104 r#"{std_shim}
105
106local scenario = (function()
107{scenario_code}
108end)()
109
110return alc.eval(scenario, "{strategy}", {{
111 strategy_opts = {opts_lua},
112 auto_card = {auto_card_lua},
113}})
114"#,
115 std_shim = STD_SHIM,
116 );
117
118 let ctx = serde_json::Value::Null;
119 let env_map = Arc::new(HashMap::new());
122 let result = self
123 .start_and_tick(env_map, wrapped, ctx, Some(strategy), vec![], vec![])
124 .await?;
125
126 let mut save_warning: Option<String> = None;
129 if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
130 match parsed.get("status").and_then(|s| s.as_str()) {
131 Some("completed") => {
132 if let Err(e) = save_eval_result(&app_dir, strategy, &result) {
133 save_warning = Some(e);
134 }
135 }
136 Some("needs_response") => {
137 if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
138 if let Ok(mut map) = self.eval_sessions.lock() {
139 map.insert(sid.to_string(), strategy.to_string());
140 }
141 }
142 }
143 _ => {}
144 }
145 }
146
147 match save_warning {
148 Some(msg) => Ok(splice_response_string(&result, "save_warning", &msg)),
149 None => Ok(result),
150 }
151 }
152
153 pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
155 let dir = evals_dir(&self.log_config.app_dir());
156 list_eval_history(&dir, strategy, limit)
157 }
158
159 pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
161 let evals_dir = evals_dir(&self.log_config.app_dir());
162 let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
163 .map_err(|e| format!("Invalid eval_id: {e}"))?;
164 if !path.exists() {
165 return Err(format!("Eval result not found: {eval_id}"));
166 }
167 std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
168 }
169
170 pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
179 let app_dir = self.log_config.app_dir();
180 let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
182 {
183 let dir = evals_dir(&app_dir);
184 if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
185 if cached_path.exists() {
186 return std::fs::read_to_string(&*cached_path)
187 .map_err(|e| format!("Failed to read cached comparison: {e}"));
188 }
189 }
190 }
191
192 if !is_package_installed(&app_dir, "evalframe") {
194 self.auto_install_bundled_packages().await?;
195 if !is_package_installed(&app_dir, "evalframe") {
196 return Err(
197 "Package 'evalframe' not found after installing bundled collection. \
198 Use alc_pkg_install to install it manually."
199 .into(),
200 );
201 }
202 }
203
204 let result_a = self.eval_detail(eval_id_a)?;
205 let result_b = self.eval_detail(eval_id_b)?;
206
207 let lua_code = format!(
210 r#"{std_shim}
211
212local stats = require("evalframe.eval.stats")
213
214local result_a = alc.json_decode('{result_a_escaped}')
215local result_b = alc.json_decode('{result_b_escaped}')
216
217local agg_a = result_a.result and result_a.result.aggregated
218local agg_b = result_b.result and result_b.result.aggregated
219
220if not agg_a or not agg_a.scores then
221 error("No aggregated scores in {eval_id_a}")
222end
223if not agg_b or not agg_b.scores then
224 error("No aggregated scores in {eval_id_b}")
225end
226
227local welch = stats.welch_t(agg_a.scores, agg_b.scores)
228
229local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
230local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
231
232local delta = agg_a.scores.mean - agg_b.scores.mean
233local winner = "none"
234if welch.significant then
235 winner = delta > 0 and "a" or "b"
236end
237
238-- Build summary text
239local parts = {{}}
240if welch.significant then
241 local w, l, d = strategy_a, strategy_b, delta
242 if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
243 parts[#parts + 1] = string.format(
244 "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
245 w, l, d, math.abs(welch.t_stat), welch.df
246 )
247else
248 parts[#parts + 1] = string.format(
249 "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
250 strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
251 )
252end
253if agg_a.pass_rate and agg_b.pass_rate then
254 local dp = agg_a.pass_rate - agg_b.pass_rate
255 if math.abs(dp) > 1e-9 then
256 local h = dp > 0 and strategy_a or strategy_b
257 parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
258 else
259 parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
260 end
261end
262
263return {{
264 a = {{
265 eval_id = "{eval_id_a}",
266 strategy = strategy_a,
267 scores = agg_a.scores,
268 pass_rate = agg_a.pass_rate,
269 pass_at_1 = agg_a.pass_at_1,
270 ci_95 = agg_a.ci_95,
271 }},
272 b = {{
273 eval_id = "{eval_id_b}",
274 strategy = strategy_b,
275 scores = agg_b.scores,
276 pass_rate = agg_b.pass_rate,
277 pass_at_1 = agg_b.pass_at_1,
278 ci_95 = agg_b.ci_95,
279 }},
280 comparison = {{
281 delta_mean = delta,
282 welch_t = {{
283 t_stat = welch.t_stat,
284 df = welch.df,
285 significant = welch.significant,
286 direction = welch.direction,
287 }},
288 winner = winner,
289 summary = table.concat(parts, " "),
290 }},
291}}
292"#,
293 result_a_escaped = escape_for_lua_sq(&result_a),
294 result_b_escaped = escape_for_lua_sq(&result_b),
295 eval_id_a = eval_id_a,
296 eval_id_b = eval_id_b,
297 std_shim = STD_SHIM,
298 strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
299 strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
300 );
301
302 let ctx = serde_json::Value::Null;
303 let env_map = Arc::new(HashMap::new());
306 let raw_result = self
307 .start_and_tick(env_map, lua_code, ctx, None, vec![], vec![])
308 .await?;
309
310 match save_compare_result(&app_dir, eval_id_a, eval_id_b, &raw_result) {
314 Ok(()) => Ok(raw_result),
315 Err(e) => Ok(splice_response_string(&raw_result, "save_warning", &e)),
316 }
317 }
318}