algocline-app 0.15.0

use super::eval_store::{
    escape_for_lua_sq, evals_dir, extract_strategy_from_id, list_eval_history, save_compare_result,
    save_eval_result,
};
use super::path::ContainedPath;
use super::resolve::{is_package_installed, resolve_scenario_code};
use super::AppService;

impl AppService {
    /// Run an evalframe evaluation suite.
    ///
    /// Accepts a scenario (bindings + cases) and a strategy name.
    /// Automatically wires the strategy as the provider and executes
    /// the evalframe suite, returning the report (summary, scores, failures).
    ///
    /// Injects a `std` global (mlua-batteries compatible shim) so evalframe's
    /// `std.lua` can resolve json/fs/time from algocline's built-in primitives.
    ///
    /// # Security: `strategy` is not sanitized
    ///
    /// `strategy` is interpolated into a Lua string literal without escaping.
    /// This is intentional — same rationale as [`make_require_code`]:
    /// algocline runs Lua in the caller's own process with full ambient
    /// authority, so Lua injection does not cross a trust boundary.
    pub async fn eval(
        &self,
        scenario: Option<String>,
        scenario_file: Option<String>,
        scenario_name: Option<String>,
        strategy: &str,
        strategy_opts: Option<serde_json::Value>,
    ) -> Result<String, String> {
        // Auto-install bundled packages if evalframe is missing
        if !is_package_installed("evalframe") {
            self.auto_install_bundled_packages().await?;
            if !is_package_installed("evalframe") {
                return Err(
                    "Package 'evalframe' not found after installing bundled collection. \
                     Use alc_pkg_install to install it manually."
                        .into(),
                );
            }
        }

        let scenario_code = resolve_scenario_code(scenario, scenario_file, scenario_name)?;

        // Build strategy opts Lua table literal
        let opts_lua = match &strategy_opts {
            Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
            _ => "{}".to_string(),
        };

        // Inject `std` global as a mlua-batteries compatible shim.
        //
        // evalframe.std expects the host to provide a `std` global with:
        //   std.json.decode/encode  — JSON serialization
        //   std.fs.read/is_file     — filesystem access
        //   std.time.now            — wall-clock time (epoch seconds, f64)
        //
        // We bridge these from algocline's alc.* primitives and Lua's io stdlib.
        let wrapped = format!(
            r#"
std = {{
  json = {{
    decode = alc.json_decode,
    encode = alc.json_encode,
  }},
  fs = {{
    read = function(path)
      local f, err = io.open(path, "r")
      if not f then error("std.fs.read: " .. (err or path), 2) end
      local content = f:read("*a")
      f:close()
      return content
    end,
    is_file = function(path)
      local f = io.open(path, "r")
      if f then f:close(); return true end
      return false
    end,
  }},
  time = {{
    now = alc.time,
  }},
}}

local ef = require("evalframe")

-- Load scenario (bindings + cases, no provider)
local spec = (function()
{scenario_code}
end)()

-- Inject strategy as provider
spec.provider = ef.providers.algocline {{
  strategy = "{strategy}",
  opts = {opts_lua},
}}

-- Build and run suite
local s = ef.suite "eval" (spec)
local report = s:run()
return report:to_table()
"#
        );

        let ctx = serde_json::Value::Null;
        let result = self
            .start_and_tick(wrapped, ctx, Some(strategy), vec![])
            .await?;

        // Register this session for eval result saving on completion.
        // start_and_tick returns the first pause (needs_response) or completed.
        // If completed immediately, save now. Otherwise, save when continue_* finishes.
        if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
            match parsed.get("status").and_then(|s| s.as_str()) {
                Some("completed") => {
                    save_eval_result(strategy, &result);
                }
                Some("needs_response") => {
                    if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
                        if let Ok(mut map) = self.eval_sessions.lock() {
                            map.insert(sid.to_string(), strategy.to_string());
                        }
                    }
                }
                _ => {}
            }
        }

        Ok(result)
    }

    /// List eval history, optionally filtered by strategy.
    pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
        let dir = evals_dir()?;
        list_eval_history(&dir, strategy, limit)
    }

    /// View a specific eval result by ID.
    pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
        let evals_dir = evals_dir()?;
        let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
            .map_err(|e| format!("Invalid eval_id: {e}"))?;
        if !path.exists() {
            return Err(format!("Eval result not found: {eval_id}"));
        }
        std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
    }

    /// Compare two eval results with statistical significance testing.
    ///
    /// Delegates to evalframe's `stats.welch_t` (single source of truth for
    /// t-distribution table and test logic). Reads persisted `aggregated.scores`
    /// from each eval result — no re-computation of descriptive statistics.
    ///
    /// The comparison result is persisted to `~/.algocline/evals/` so repeated
    /// lookups of the same pair are file reads only.
    pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
        // Check for cached comparison
        let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
        if let Ok(dir) = evals_dir() {
            if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
                if cached_path.exists() {
                    return std::fs::read_to_string(&*cached_path)
                        .map_err(|e| format!("Failed to read cached comparison: {e}"));
                }
            }
        }

        // Auto-install bundled packages if evalframe is missing
        if !is_package_installed("evalframe") {
            self.auto_install_bundled_packages().await?;
            if !is_package_installed("evalframe") {
                return Err(
                    "Package 'evalframe' not found after installing bundled collection. \
                     Use alc_pkg_install to install it manually."
                        .into(),
                );
            }
        }

        let result_a = self.eval_detail(eval_id_a)?;
        let result_b = self.eval_detail(eval_id_b)?;

        // Build Lua snippet that uses evalframe's stats module
        // to compute welch_t from the persisted aggregated scores.
        let lua_code = format!(
            r#"
std = {{
  json = {{
    decode = alc.json_decode,
    encode = alc.json_encode,
  }},
  fs = {{ read = function() end, is_file = function() return false end }},
  time = {{ now = alc.time }},
}}

local stats = require("evalframe.eval.stats")

local result_a = alc.json_decode('{result_a_escaped}')
local result_b = alc.json_decode('{result_b_escaped}')

local agg_a = result_a.result and result_a.result.aggregated
local agg_b = result_b.result and result_b.result.aggregated

if not agg_a or not agg_a.scores then
  error("No aggregated scores in {eval_id_a}")
end
if not agg_b or not agg_b.scores then
  error("No aggregated scores in {eval_id_b}")
end

local welch = stats.welch_t(agg_a.scores, agg_b.scores)

local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"

local delta = agg_a.scores.mean - agg_b.scores.mean
local winner = "none"
if welch.significant then
  winner = delta > 0 and "a" or "b"
end

-- Build summary text
local parts = {{}}
if welch.significant then
  local w, l, d = strategy_a, strategy_b, delta
  if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
  parts[#parts + 1] = string.format(
    "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
    w, l, d, math.abs(welch.t_stat), welch.df
  )
else
  parts[#parts + 1] = string.format(
    "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
    strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
  )
end
if agg_a.pass_rate and agg_b.pass_rate then
  local dp = agg_a.pass_rate - agg_b.pass_rate
  if math.abs(dp) > 1e-9 then
    local h = dp > 0 and strategy_a or strategy_b
    parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
  else
    parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
  end
end

return {{
  a = {{
    eval_id = "{eval_id_a}",
    strategy = strategy_a,
    scores = agg_a.scores,
    pass_rate = agg_a.pass_rate,
    pass_at_1 = agg_a.pass_at_1,
    ci_95 = agg_a.ci_95,
  }},
  b = {{
    eval_id = "{eval_id_b}",
    strategy = strategy_b,
    scores = agg_b.scores,
    pass_rate = agg_b.pass_rate,
    pass_at_1 = agg_b.pass_at_1,
    ci_95 = agg_b.ci_95,
  }},
  comparison = {{
    delta_mean = delta,
    welch_t = {{
      t_stat = welch.t_stat,
      df = welch.df,
      significant = welch.significant,
      direction = welch.direction,
    }},
    winner = winner,
    summary = table.concat(parts, " "),
  }},
}}
"#,
            result_a_escaped = escape_for_lua_sq(&result_a),
            result_b_escaped = escape_for_lua_sq(&result_b),
            eval_id_a = eval_id_a,
            eval_id_b = eval_id_b,
            strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
            strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
        );

        let ctx = serde_json::Value::Null;
        let raw_result = self.start_and_tick(lua_code, ctx, None, vec![]).await?;

        // Persist comparison result
        save_compare_result(eval_id_a, eval_id_b, &raw_result);

        Ok(raw_result)
    }
}