use aethershell::builtins::{est_token_count, render_canonical};
use aethershell::env::Env;
use aethershell::eval::eval_program;
use aethershell::parser::parse_program;
use aethershell::safety::{self, SafetyError};
use aethershell::value::Value;
use agentic_eval::determinism::assess_determinism;
use agentic_eval::reliability::{assess_reliability, Outcome};
use agentic_eval::safety::{assess_safety_named, Effect, Mode};
use agentic_eval::tokens::{evaluate_with, Program};
/// Parses `code` and evaluates the resulting program in a freshly created [`Env`].
///
/// # Errors
///
/// A failure from `parse_program` is propagated with `?`; otherwise the result
/// of `eval_program` is returned as-is.
fn eval_to_value(code: &str) -> anyhow::Result<Value> {
    let program = parse_program(code)?;
    // Each call gets its own environment, so no state is shared between calls.
    let mut scope = Env::new();
    eval_program(&program, &mut scope)
}
/// Compares the estimated token totals of a "legible" program and a terse
/// "cipher" program over 30 turns and asserts that the legible one is strictly
/// smaller.
///
/// The cipher program carries the ontology description as standing context and
/// one retry; the legible program carries only a short note as standing context.
#[test]
fn aethershell_token_surface_is_competitive_over_a_session() {
    let legible = evaluate_with(
        &Program::new("legible", r#"ls(".") | where(fn(f) => f.size > 1000)"#)
            .with_standing_context("ls/where/map are standard names"),
        est_token_count,
    );
    let cipher = evaluate_with(
        &Program::new("cipher", r#"l.|w~.size>1k"#)
            .with_standing_context(aethershell::transpile::agentic::describe_ontology())
            .with_retries(1),
        est_token_count,
    );

    // Both totals use the same 30-turn horizon.
    let legible_total = legible.total_over(30);
    let cipher_total = cipher.total_over(30);

    assert!(
        legible_total < cipher_total,
        "legible {} should beat cipher {} over 30 turns",
        legible_total,
        cipher_total
    );
}
/// Evaluates the same record literal six times, renders each result with
/// `render_canonical`, and asserts that every run produced the same text and
/// that the text starts with the keys `a` then `b`.
#[test]
fn aethershell_canonical_render_is_deterministic() {
    // Keys are deliberately written out of order (`b` before `a`) in the input.
    const RECORD_SRC: &str = r#"{ b: 2.0, a: 1, items: [3, 1, 2] }"#;
    const EXPECTED_PREFIX: &str = r#"{"a":1,"b":2"#;

    let report = assess_determinism(6, || {
        let value = eval_to_value(RECORD_SRC).expect("eval");
        // A failed render collapses to the default value here; the prefix
        // assertion below would then fail.
        render_canonical(&value).unwrap_or_default()
    });

    assert!(report.deterministic, "canonical render must be byte-stable");
    assert_eq!(report.distinct, 1);

    let rendered = &report.first;
    assert!(rendered.starts_with(EXPECTED_PREFIX), "got: {}", rendered);
}
/// Runs five programs through `eval_to_value` and classifies each outcome:
/// success, a failure that downcasts to [`SafetyError`] (structured), or any
/// other failure (opaque).
///
/// Asserts that three programs pass, exactly one failure is structured, and the
/// reported actionable rate is 0.8 within `1e-9`.
#[test]
fn aethershell_is_reliable_with_actionable_failures() {
    let programs = [
        r#"len([1, 2, 3])"#,
        r#"upper("hi")"#,
        r#"[1, 2, 3] | map(fn(x) => x + 1)"#,
        // NOTE(review): the assertions below imply these two fail, exactly one
        // of them with a `SafetyError` — which one is not visible from here.
        r#"env(123)"#,
        r#"((("#,
    ];

    let summary = assess_reliability(&programs, |code| match eval_to_value(code) {
        Ok(_) => Outcome::ok(),
        Err(err) => {
            if err.downcast_ref::<SafetyError>().is_some() {
                Outcome::structured_failure()
            } else {
                Outcome::opaque_failure()
            }
        }
    });

    assert_eq!(summary.passed, 3, "the three valid programs run");
    assert_eq!(summary.structured_failures, 1);

    // Compare floats with a tolerance rather than exact equality.
    let deviation = (summary.actionable_rate - 0.8).abs();
    assert!(deviation < 1e-9, "got {}", summary.actionable_rate);
}
/// Maps a fixed list of builtin names to effects via `safety::effect_of` and
/// checks the safety report produced for `Mode::Agent`.
///
/// Asserts that every builtin yields an effect, that the report is bounded with
/// no ungated dangerous builtins, that the score is 1.0 with grade `'A'`, and
/// that three entries are approval-gated.
#[test]
fn aethershell_bounds_dangerous_builtin_blast_radius_in_agent_mode() {
    const BUILTINS: [&str; 7] = [
        "len",
        "file_read",
        "file_write",
        "http_get",
        "proc_kill",
        "rm",
        "sh",
    ];

    // The aethershell effect is bridged to the agentic_eval effect by its name.
    let report = assess_safety_named(
        &BUILTINS,
        |builtin| Effect::from_name(safety::effect_of(builtin).as_str()),
        Mode::Agent,
    );

    assert_eq!(
        report.effects,
        BUILTINS.len(),
        "every builtin maps to an effect"
    );
    assert!(
        report.bounded,
        "no dangerous builtin runs ungated in agent mode"
    );
    assert_eq!(report.dangerous_ungated, 0);
    assert_eq!(report.score, 1.0);
    assert_eq!(report.grade, 'A');
    assert_eq!(report.approval_gated, 3);
}
/// Cross-checks that `aethershell::safety::decide` and
/// `agentic_eval::safety::decide` return the same decision for every paired
/// effect and mode listed below.
///
/// The two crates declare separate `Decision` enums, so each decision is mapped
/// to a plain label before comparison. On a mismatch the failure message names
/// both the effect and the mode in which the policies diverged.
#[test]
fn agentic_eval_policy_stays_in_sync_with_aethershell() {
    use aethershell::safety as ae;
    use agentic_eval::safety as ag;

    // Removes `AETHER_POLICY` from the process environment before comparing.
    // NOTE(review): presumably so an ambient policy override cannot skew
    // `ae::decide` — confirm. This mutates process-wide state.
    std::env::remove_var("AETHER_POLICY");

    // Exhaustive matches: adding a `Decision` variant in either crate breaks
    // the build here instead of silently slipping through.
    let label_ae = |d: ae::Decision| match d {
        ae::Decision::Allow => "allow",
        ae::Decision::Approve => "approve",
        ae::Decision::Deny => "deny",
    };
    let label_ag = |d: ag::Decision| match d {
        ag::Decision::Allow => "allow",
        ag::Decision::Approve => "approve",
        ag::Decision::Deny => "deny",
    };

    let effects = [
        (ae::Effect::Pure, ag::Effect::Pure),
        (ae::Effect::ReadLocal, ag::Effect::ReadLocal),
        (ae::Effect::WriteLocal, ag::Effect::WriteLocal),
        (ae::Effect::Network, ag::Effect::Network),
        (ae::Effect::Process, ag::Effect::Process),
        (ae::Effect::Destructive, ag::Effect::Destructive),
        (ae::Effect::Exec, ag::Effect::Exec),
        (ae::Effect::Privileged, ag::Effect::Privileged),
    ];
    // Each mode pair carries a label so a failure can say which mode drifted.
    let modes = [
        ("human", ae::Mode::Human, ag::Mode::Human),
        ("agent", ae::Mode::Agent, ag::Mode::Agent),
    ];

    for (ae_e, ag_e) in effects {
        for (mode_name, ae_m, ag_m) in modes {
            assert_eq!(
                label_ae(ae::decide(ae_e, ae_m)),
                label_ag(ag::decide(ag_e, ag_m)),
                "policy drift for effect {} in {} mode",
                ag_e.name(),
                mode_name
            );
        }
    }
}