use super::eval_store::{
escape_for_lua_sq, evals_dir, extract_strategy_from_id, list_eval_history, save_compare_result,
save_eval_result,
};
use super::path::ContainedPath;
use super::resolve::{is_package_installed, resolve_scenario_code};
use super::AppService;
const STD_SHIM: &str = r#"
std = {
json = {
decode = alc.json_decode,
encode = alc.json_encode,
},
fs = {
read = function(path)
local f, err = io.open(path, "r")
if not f then error("std.fs.read: " .. (err or path), 2) end
local content = f:read("*a")
f:close()
return content
end,
is_file = function(path)
local f = io.open(path, "r")
if f then f:close(); return true end
return false
end,
},
time = {
now = alc.time,
},
}
"#;
impl AppService {
pub async fn eval(
&self,
scenario: Option<String>,
scenario_file: Option<String>,
scenario_name: Option<String>,
strategy: &str,
strategy_opts: Option<serde_json::Value>,
auto_card: bool,
) -> Result<String, String> {
if !is_package_installed("evalframe") {
self.auto_install_bundled_packages().await?;
if !is_package_installed("evalframe") {
return Err(
"Package 'evalframe' not found after installing bundled collection. \
Use alc_pkg_install to install it manually."
.into(),
);
}
}
let scenario_code = resolve_scenario_code(scenario, scenario_file, scenario_name.clone())?;
let opts_lua = match &strategy_opts {
Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
_ => "nil".to_string(),
};
let auto_card_lua = if auto_card { "true" } else { "false" };
let wrapped = format!(
r#"{std_shim}
local scenario = (function()
{scenario_code}
end)()
return alc.eval(scenario, "{strategy}", {{
strategy_opts = {opts_lua},
auto_card = {auto_card_lua},
}})
"#,
std_shim = STD_SHIM,
);
let ctx = serde_json::Value::Null;
let result = self
.start_and_tick(wrapped, ctx, Some(strategy), vec![])
.await?;
if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
match parsed.get("status").and_then(|s| s.as_str()) {
Some("completed") => {
save_eval_result(strategy, &result);
}
Some("needs_response") => {
if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
if let Ok(mut map) = self.eval_sessions.lock() {
map.insert(sid.to_string(), strategy.to_string());
}
}
}
_ => {}
}
}
Ok(result)
}
pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
let dir = evals_dir()?;
list_eval_history(&dir, strategy, limit)
}
pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
let evals_dir = evals_dir()?;
let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
.map_err(|e| format!("Invalid eval_id: {e}"))?;
if !path.exists() {
return Err(format!("Eval result not found: {eval_id}"));
}
std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
}
pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
if let Ok(dir) = evals_dir() {
if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
if cached_path.exists() {
return std::fs::read_to_string(&*cached_path)
.map_err(|e| format!("Failed to read cached comparison: {e}"));
}
}
}
if !is_package_installed("evalframe") {
self.auto_install_bundled_packages().await?;
if !is_package_installed("evalframe") {
return Err(
"Package 'evalframe' not found after installing bundled collection. \
Use alc_pkg_install to install it manually."
.into(),
);
}
}
let result_a = self.eval_detail(eval_id_a)?;
let result_b = self.eval_detail(eval_id_b)?;
let lua_code = format!(
r#"{std_shim}
local stats = require("evalframe.eval.stats")
local result_a = alc.json_decode('{result_a_escaped}')
local result_b = alc.json_decode('{result_b_escaped}')
local agg_a = result_a.result and result_a.result.aggregated
local agg_b = result_b.result and result_b.result.aggregated
if not agg_a or not agg_a.scores then
error("No aggregated scores in {eval_id_a}")
end
if not agg_b or not agg_b.scores then
error("No aggregated scores in {eval_id_b}")
end
local welch = stats.welch_t(agg_a.scores, agg_b.scores)
local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
local delta = agg_a.scores.mean - agg_b.scores.mean
local winner = "none"
if welch.significant then
winner = delta > 0 and "a" or "b"
end
-- Build summary text
local parts = {{}}
if welch.significant then
local w, l, d = strategy_a, strategy_b, delta
if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
parts[#parts + 1] = string.format(
"%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
w, l, d, math.abs(welch.t_stat), welch.df
)
else
parts[#parts + 1] = string.format(
"No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
)
end
if agg_a.pass_rate and agg_b.pass_rate then
local dp = agg_a.pass_rate - agg_b.pass_rate
if math.abs(dp) > 1e-9 then
local h = dp > 0 and strategy_a or strategy_b
parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
else
parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
end
end
return {{
a = {{
eval_id = "{eval_id_a}",
strategy = strategy_a,
scores = agg_a.scores,
pass_rate = agg_a.pass_rate,
pass_at_1 = agg_a.pass_at_1,
ci_95 = agg_a.ci_95,
}},
b = {{
eval_id = "{eval_id_b}",
strategy = strategy_b,
scores = agg_b.scores,
pass_rate = agg_b.pass_rate,
pass_at_1 = agg_b.pass_at_1,
ci_95 = agg_b.ci_95,
}},
comparison = {{
delta_mean = delta,
welch_t = {{
t_stat = welch.t_stat,
df = welch.df,
significant = welch.significant,
direction = welch.direction,
}},
winner = winner,
summary = table.concat(parts, " "),
}},
}}
"#,
result_a_escaped = escape_for_lua_sq(&result_a),
result_b_escaped = escape_for_lua_sq(&result_b),
eval_id_a = eval_id_a,
eval_id_b = eval_id_b,
std_shim = STD_SHIM,
strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
);
let ctx = serde_json::Value::Null;
let raw_result = self.start_and_tick(lua_code, ctx, None, vec![]).await?;
save_compare_result(eval_id_a, eval_id_b, &raw_result);
Ok(raw_result)
}
}