#![forbid(unsafe_code)]
#![deny(missing_docs)]
pub mod commands;
pub mod determinism;
pub mod frameworks;
pub mod languages;
pub mod ontology;
pub mod reliability;
pub mod safety;
pub mod tokens;
pub mod vms;
pub mod web;
pub use commands::{assess_safety_script, classify_command, classify_invocation, classify_script};
pub use determinism::{assess_determinism, DeterminismReport};
pub use frameworks::{
compare_frameworks, rank_frameworks, Framework, FrameworkComparison, FrameworkProfile,
};
pub use languages::{
compare_languages, rank_languages, Language, LanguageComparison, LanguageProfile,
};
pub use ontology::{ontology, Ontology};
pub use reliability::{
assess_error_quality, assess_reliability, ErrorQuality, ErrorQualityReport, Outcome,
ReliabilityReport,
};
pub use safety::{
assess_exfiltration, assess_reversibility, assess_safety, assess_safety_named, Decision,
Effect, ExfiltrationReport, Mode, ReversibilityReport, SafetyReport,
};
pub use tokens::{
assess_cache, assess_scaling, cacheable_prefix_tokens, compare, evaluate, evaluate_with, rank,
rank_with, AgentCost, CacheReport, Comparison, Model, Program, ScalingReport,
};
pub use vms::{compare_vms, rank_vms, Vm, VmComparison, VmProfile};
pub use web::{compare_web_stacks, rank_web_stacks, WebStack, WebStackComparison, WebStackProfile};
/// A named bundle of optional per-axis assessment results.
///
/// Every axis is optional: `Default` (and therefore [`Evaluation::new`])
/// leaves them all as `None`, and the `with_*` builder methods fill them in
/// one at a time. When the `serde` feature is enabled the type also derives
/// `serde::Serialize`.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, Default)]
pub struct Evaluation {
/// Label printed in the `evaluation: …` header line of the `Display` output.
pub name: String,
/// Token-cost result, if measured. Shown by `Display` but not used by
/// [`Evaluation::fitness`].
pub tokens: Option<tokens::AgentCost>,
/// Determinism result, if measured. Contributes `1.0` or `0.0` to
/// [`Evaluation::fitness`] depending on its `deterministic` flag.
pub determinism: Option<determinism::DeterminismReport>,
/// Reliability result, if measured. Contributes the mean of its
/// `pass_rate` and `actionable_rate` to [`Evaluation::fitness`].
pub reliability: Option<reliability::ReliabilityReport>,
/// Safety result, if measured. Contributes its `score` to
/// [`Evaluation::fitness`].
pub safety: Option<safety::SafetyReport>,
}
impl Evaluation {
    /// Creates an evaluation labelled `name` with no axis measured yet.
    pub fn new(name: impl Into<String>) -> Self {
        Self {
            name: name.into(),
            ..Self::default()
        }
    }

    /// Stores the token-cost result `c`, replacing any previous one, and
    /// returns the evaluation for further chaining.
    pub fn with_tokens(self, c: tokens::AgentCost) -> Self {
        Self {
            tokens: Some(c),
            ..self
        }
    }

    /// Stores the determinism result `d`, replacing any previous one, and
    /// returns the evaluation for further chaining.
    pub fn with_determinism(self, d: determinism::DeterminismReport) -> Self {
        Self {
            determinism: Some(d),
            ..self
        }
    }

    /// Stores the reliability result `r`, replacing any previous one, and
    /// returns the evaluation for further chaining.
    pub fn with_reliability(self, r: reliability::ReliabilityReport) -> Self {
        Self {
            reliability: Some(r),
            ..self
        }
    }

    /// Stores the safety result `s`, replacing any previous one, and returns
    /// the evaluation for further chaining.
    pub fn with_safety(self, s: safety::SafetyReport) -> Self {
        Self {
            safety: Some(s),
            ..self
        }
    }

    /// Returns the arithmetic mean of the scores of the measured axes.
    ///
    /// Per-axis scores, in the order they are summed:
    ///
    /// * determinism — `1.0` when `deterministic` is set, otherwise `0.0`;
    /// * reliability — the mean of `pass_rate` and `actionable_rate`;
    /// * safety — `score` as is.
    ///
    /// Axes that are `None` are left out of the average entirely. The token
    /// cost never takes part. Returns `None` when none of the three scorable
    /// axes is present.
    pub fn fitness(&self) -> Option<f64> {
        // One slot per scorable axis; the order matches the summation order.
        let axis_scores: [Option<f64>; 3] = [
            self.determinism
                .as_ref()
                .map(|report| if report.deterministic { 1.0 } else { 0.0 }),
            self.reliability
                .as_ref()
                .map(|report| (report.pass_rate + report.actionable_rate) / 2.0),
            self.safety.as_ref().map(|report| report.score),
        ];

        // Explicit fold from 0.0 keeps the left-to-right floating-point sum.
        let (total, measured) = axis_scores
            .iter()
            .flatten()
            .fold((0.0_f64, 0_u32), |(total, measured), score| {
                (total + score, measured + 1)
            });

        match measured {
            0 => None,
            count => Some(total / f64::from(count)),
        }
    }
}
// Multi-line, human-readable rendering: a header line with the name, one
// line per measured axis (unmeasured axes are omitted), and a final fitness
// line that is not newline-terminated.
impl std::fmt::Display for Evaluation {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "evaluation: {}", self.name)?;

        if let Some(cost) = self.tokens.as_ref() {
            writeln!(f, " tokens: {}", cost)?;
        }
        if let Some(report) = self.determinism.as_ref() {
            writeln!(f, " determinism: {}", report)?;
        }
        if let Some(report) = self.reliability.as_ref() {
            writeln!(f, " reliability: {}", report)?;
        }
        if let Some(report) = self.safety.as_ref() {
            writeln!(f, " safety: {}", report)?;
        }

        // The last line carries the aggregate score, or a placeholder when
        // `fitness` had nothing to average.
        if let Some(score) = self.fitness() {
            write!(f, " fitness: {:.2}", score)
        } else {
            f.write_str(" fitness: n/a (no scorable axis measured)")
        }
    }
}
// Built only for test runs with the `serde` feature enabled.
#[cfg(all(test, feature = "serde"))]
mod serde_tests {
    use super::*;

    /// Compile-time probe: can only be instantiated for a `T` that
    /// implements `serde::Serialize`. Does nothing at run time.
    fn requires_serialize<T: serde::Serialize>() {}

    /// Fails to compile, rather than failing at run time, if any listed
    /// public type loses its `Serialize` implementation.
    #[test]
    fn report_types_implement_serialize() {
        // Aggregate defined in this file.
        requires_serialize::<Evaluation>();

        // Re-exported from `tokens`.
        requires_serialize::<AgentCost>();
        requires_serialize::<Program>();
        requires_serialize::<Comparison>();
        requires_serialize::<Model>();

        // Re-exported from `determinism`.
        requires_serialize::<DeterminismReport>();

        // Re-exported from `reliability`.
        requires_serialize::<Outcome>();
        requires_serialize::<ReliabilityReport>();

        // Re-exported from `safety`.
        requires_serialize::<Effect>();
        requires_serialize::<Mode>();
        requires_serialize::<Decision>();
        requires_serialize::<SafetyReport>();
    }
}