algocline_app/service/
eval.rs

1use std::collections::HashMap;
2use std::sync::Arc;
3
4use algocline_core::pkg::PkgType;
5
6use super::eval_store::{
7    escape_for_lua_sq, evals_dir, extract_strategy_from_id, list_eval_history, save_compare_result,
8    save_eval_result, splice_response_string,
9};
10use super::path::ContainedPath;
11use super::resolve::{is_package_installed, resolve_scenario_code};
12use super::run::normalize_stringified_json_object;
13use super::AppService;
14
/// Lua shim that bridges algocline's `alc.*` primitives to the `std` global
/// expected by evalframe.std. Injected once before any evalframe code runs.
///
/// The shim assigns a global table `std` with these entries:
///
/// - `std.json.decode` / `std.json.encode`: aliases for `alc.json_decode` and
///   `alc.json_encode`.
/// - `std.fs.read(path)`: opens `path` with `io.open(path, "r")`, reads the
///   whole content (`"*a"`), closes the handle and returns the content. If the
///   file cannot be opened it raises a Lua error prefixed with
///   `std.fs.read: ` (error level 2, i.e. attributed to the caller).
/// - `std.fs.is_file(path)`: returns `true` when `io.open(path, "r")`
///   succeeds (the handle is closed again), `false` otherwise.
/// - `std.time.now`: alias for `alc.time`.
///
/// The text is spliced verbatim at the top of the Lua chunks built by
/// `AppService::eval` and `AppService::eval_compare`.
const STD_SHIM: &str = r#"
std = {
  json = {
    decode = alc.json_decode,
    encode = alc.json_encode,
  },
  fs = {
    read = function(path)
      local f, err = io.open(path, "r")
      if not f then error("std.fs.read: " .. (err or path), 2) end
      local content = f:read("*a")
      f:close()
      return content
    end,
    is_file = function(path)
      local f = io.open(path, "r")
      if f then f:close(); return true end
      return false
    end,
  },
  time = {
    now = alc.time,
  },
}
"#;
42
43impl AppService {
44    /// Run an evalframe evaluation suite via `alc.eval()`.
45    ///
46    /// Resolves the scenario from one of three input modes (inline/file/name),
47    /// injects the `std` global shim, and delegates to `alc.eval()` in prelude
48    /// which handles evalframe loading, provider wiring, and optional Card
49    /// emission.
50    ///
51    /// # Security: `strategy` is not sanitized
52    ///
53    /// `strategy` is interpolated into a Lua string literal without escaping.
54    /// This is intentional — algocline runs Lua in the caller's own process
55    /// with full ambient authority, so Lua injection does not cross a trust
56    /// boundary.
57    pub async fn eval(
58        &self,
59        scenario: Option<String>,
60        scenario_file: Option<String>,
61        scenario_name: Option<String>,
62        strategy: &str,
63        strategy_opts: Option<serde_json::Value>,
64        auto_card: bool,
65    ) -> Result<String, String> {
66        // Auto-install bundled packages if evalframe is missing
67        let app_dir = self.log_config.app_dir();
68        if !is_package_installed(&app_dir, "evalframe") {
69            self.auto_install_bundled_packages().await?;
70            if !is_package_installed(&app_dir, "evalframe") {
71                return Err(
72                    "Package 'evalframe' not found after installing bundled collection. \
73                     Use alc_pkg_install to install it manually."
74                        .into(),
75                );
76            }
77        }
78
79        // Guard: reject library packages before start_and_tick (= any LLM call)
80        if let Some((PkgType::Library, _)) = self.resolve_pkg_type_lua(strategy).await? {
81            return Err(format!(
82                "Package '{strategy}' is a library package (type = \"library\"). \
83                 Library packages cannot be evaluated as strategies. \
84                 Use a runnable package instead."
85            ));
86        }
87
88        let scenario_code =
89            resolve_scenario_code(&app_dir, scenario, scenario_file, scenario_name.clone())?;
90
91        // Build strategy opts Lua table literal
92        let strategy_opts = strategy_opts.map(normalize_stringified_json_object);
93        let opts_lua = match &strategy_opts {
94            Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
95            _ => "nil".to_string(),
96        };
97
98        let auto_card_lua = if auto_card { "true" } else { "false" };
99
100        // Delegate to alc.eval() in prelude.
101        // The shim injects `std` for evalframe, then the scenario code is
102        // evaluated into a table and passed to alc.eval() along with opts.
103        let wrapped = format!(
104            r#"{std_shim}
105
106local scenario = (function()
107{scenario_code}
108end)()
109
110return alc.eval(scenario, "{strategy}", {{
111  strategy_opts = {opts_lua},
112  auto_card = {auto_card_lua},
113}})
114"#,
115            std_shim = STD_SHIM,
116        );
117
118        let ctx = serde_json::Value::Null;
119        // eval path does not accept ctx.env; pass an empty map so alc.env is
120        // present but empty (no env vars visible to eval strategies).
121        let env_map = Arc::new(HashMap::new());
122        let result = self
123            .start_and_tick(env_map, wrapped, ctx, Some(strategy), vec![], vec![])
124            .await?;
125
126        // Persist eval result for history/comparison.
127        // Card emission is handled by alc.eval() Lua-side when auto_card=true.
128        let mut save_warning: Option<String> = None;
129        if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
130            match parsed.get("status").and_then(|s| s.as_str()) {
131                Some("completed") => {
132                    if let Err(e) = save_eval_result(&app_dir, strategy, &result) {
133                        save_warning = Some(e);
134                    }
135                }
136                Some("needs_response") => {
137                    if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
138                        if let Ok(mut map) = self.eval_sessions.lock() {
139                            map.insert(sid.to_string(), strategy.to_string());
140                        }
141                    }
142                }
143                _ => {}
144            }
145        }
146
147        match save_warning {
148            Some(msg) => Ok(splice_response_string(&result, "save_warning", &msg)),
149            None => Ok(result),
150        }
151    }
152
153    /// List eval history, optionally filtered by strategy.
154    pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
155        let dir = evals_dir(&self.log_config.app_dir());
156        list_eval_history(&dir, strategy, limit)
157    }
158
159    /// View a specific eval result by ID.
160    pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
161        let evals_dir = evals_dir(&self.log_config.app_dir());
162        let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
163            .map_err(|e| format!("Invalid eval_id: {e}"))?;
164        if !path.exists() {
165            return Err(format!("Eval result not found: {eval_id}"));
166        }
167        std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
168    }
169
170    /// Compare two eval results with statistical significance testing.
171    ///
172    /// Delegates to evalframe's `stats.welch_t` (single source of truth for
173    /// t-distribution table and test logic). Reads persisted `aggregated.scores`
174    /// from each eval result — no re-computation of descriptive statistics.
175    ///
176    /// The comparison result is persisted to `~/.algocline/evals/` so repeated
177    /// lookups of the same pair are file reads only.
178    pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
179        let app_dir = self.log_config.app_dir();
180        // Check for cached comparison
181        let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
182        {
183            let dir = evals_dir(&app_dir);
184            if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
185                if cached_path.exists() {
186                    return std::fs::read_to_string(&*cached_path)
187                        .map_err(|e| format!("Failed to read cached comparison: {e}"));
188                }
189            }
190        }
191
192        // Auto-install bundled packages if evalframe is missing
193        if !is_package_installed(&app_dir, "evalframe") {
194            self.auto_install_bundled_packages().await?;
195            if !is_package_installed(&app_dir, "evalframe") {
196                return Err(
197                    "Package 'evalframe' not found after installing bundled collection. \
198                     Use alc_pkg_install to install it manually."
199                        .into(),
200                );
201            }
202        }
203
204        let result_a = self.eval_detail(eval_id_a)?;
205        let result_b = self.eval_detail(eval_id_b)?;
206
207        // Build Lua snippet that uses evalframe's stats module
208        // to compute welch_t from the persisted aggregated scores.
209        let lua_code = format!(
210            r#"{std_shim}
211
212local stats = require("evalframe.eval.stats")
213
214local result_a = alc.json_decode('{result_a_escaped}')
215local result_b = alc.json_decode('{result_b_escaped}')
216
217local agg_a = result_a.result and result_a.result.aggregated
218local agg_b = result_b.result and result_b.result.aggregated
219
220if not agg_a or not agg_a.scores then
221  error("No aggregated scores in {eval_id_a}")
222end
223if not agg_b or not agg_b.scores then
224  error("No aggregated scores in {eval_id_b}")
225end
226
227local welch = stats.welch_t(agg_a.scores, agg_b.scores)
228
229local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
230local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
231
232local delta = agg_a.scores.mean - agg_b.scores.mean
233local winner = "none"
234if welch.significant then
235  winner = delta > 0 and "a" or "b"
236end
237
238-- Build summary text
239local parts = {{}}
240if welch.significant then
241  local w, l, d = strategy_a, strategy_b, delta
242  if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
243  parts[#parts + 1] = string.format(
244    "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
245    w, l, d, math.abs(welch.t_stat), welch.df
246  )
247else
248  parts[#parts + 1] = string.format(
249    "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
250    strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
251  )
252end
253if agg_a.pass_rate and agg_b.pass_rate then
254  local dp = agg_a.pass_rate - agg_b.pass_rate
255  if math.abs(dp) > 1e-9 then
256    local h = dp > 0 and strategy_a or strategy_b
257    parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
258  else
259    parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
260  end
261end
262
263return {{
264  a = {{
265    eval_id = "{eval_id_a}",
266    strategy = strategy_a,
267    scores = agg_a.scores,
268    pass_rate = agg_a.pass_rate,
269    pass_at_1 = agg_a.pass_at_1,
270    ci_95 = agg_a.ci_95,
271  }},
272  b = {{
273    eval_id = "{eval_id_b}",
274    strategy = strategy_b,
275    scores = agg_b.scores,
276    pass_rate = agg_b.pass_rate,
277    pass_at_1 = agg_b.pass_at_1,
278    ci_95 = agg_b.ci_95,
279  }},
280  comparison = {{
281    delta_mean = delta,
282    welch_t = {{
283      t_stat = welch.t_stat,
284      df = welch.df,
285      significant = welch.significant,
286      direction = welch.direction,
287    }},
288    winner = winner,
289    summary = table.concat(parts, " "),
290  }},
291}}
292"#,
293            result_a_escaped = escape_for_lua_sq(&result_a),
294            result_b_escaped = escape_for_lua_sq(&result_b),
295            eval_id_a = eval_id_a,
296            eval_id_b = eval_id_b,
297            std_shim = STD_SHIM,
298            strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
299            strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
300        );
301
302        let ctx = serde_json::Value::Null;
303        // compare path does not accept ctx.env; pass an empty map so alc.env is
304        // present but empty (no env vars visible to compare strategies).
305        let env_map = Arc::new(HashMap::new());
306        let raw_result = self
307            .start_and_tick(env_map, lua_code, ctx, None, vec![], vec![])
308            .await?;
309
310        // Persist comparison result. Storage failure surfaces as an
311        // additive `save_warning` field on the response — the comparison
312        // itself ran to completion and remains valid in memory.
313        match save_compare_result(&app_dir, eval_id_a, eval_id_b, &raw_result) {
314            Ok(()) => Ok(raw_result),
315            Err(e) => Ok(splice_response_string(&raw_result, "save_warning", &e)),
316        }
317    }
318}
algocline_app/service/eval.rs

algocline_app/service/
eval.rs