Skip to main content

brainwires_eval/
lib.rs

1//! # brainwires-eval
2//!
3//! Evaluation framework for Brainwires agents.
4//!
5//! ## What's included
6//!
7//! | Module | Key type | Purpose |
8//! |--------|----------|---------|
9//! | [`trial`] | [`TrialResult`], [`EvaluationStats`] | Per-trial results + Wilson-score 95 % CI |
10//! | [`case`] | [`EvaluationCase`] | Trait for a single evaluatable scenario |
11//! | [`suite`] | [`EvaluationSuite`], [`SuiteResult`] | N-trial Monte Carlo runner |
12//! | [`recorder`] | [`ToolSequenceRecorder`], [`SequenceDiff`] | Record + diff tool call sequences |
13//! | [`adversarial`] | [`AdversarialTestCase`] | Prompt injection, ambiguity, budget stress |
14
15pub mod adversarial; // adversarial cases: prompt injection, ambiguity, budget stress
16pub mod case; // `EvaluationCase` trait + built-in helper cases
17pub mod fault_report; // `FaultKind`, `FaultReport`, `analyze_suite_for_faults`
18pub mod fixtures; // fixture types, `FixtureRunner`, `load_fixture_file` / `load_fixtures_from_dir`
19pub mod ranking_metrics; // `mrr`, `ndcg_at_k`, `precision_at_k`
20pub mod recorder; // record + diff tool call sequences
21pub mod regression; // regression suite: `RegressionSuite`, its config, baselines and results
22pub mod stability_tests; // stability cases + `long_horizon_stability_suite`
23pub mod suite; // N-trial Monte Carlo runner
24pub mod trial; // per-trial results + Wilson-score 95 % CI
25
26// ── Top-level re-exports ──────────────────────────────────────────────────────
27
28// Trial types
29pub use trial::{ConfidenceInterval95, EvaluationStats, TrialResult};
30
31// Case trait + built-in helpers
32pub use case::{AlwaysFailCase, AlwaysPassCase, EvaluationCase, StochasticCase};
33
34// Suite types
35pub use suite::{EvaluationSuite, SuiteConfig, SuiteResult};
36
37// Recorder
38pub use recorder::{SequenceDiff, ToolCallRecord, ToolSequenceRecorder};
39
40// Adversarial
41pub use adversarial::{AdversarialTestCase, AdversarialTestType};
42
43// Regression suite
44pub use regression::{
45    CategoryBaseline, CategoryRegressionResult, RegressionConfig, RegressionResult, RegressionSuite,
46};
47
48// Stability tests
49pub use stability_tests::{
50    GoalPreservationCase, LoopDetectionSimCase, long_horizon_stability_suite,
51};
52
53// Fault report
54pub use fault_report::{FaultKind, FaultReport, analyze_suite_for_faults};
55
56// Fixtures
57pub use fixtures::{
58    Assertion, ExpectedBehavior, Fixture, FixtureCase, FixtureMessage, FixtureRunner, RunOutcome,
59    load_fixture_file, load_fixtures_from_dir,
60};
61
62// Ranking metrics
63pub use ranking_metrics::{mrr, ndcg_at_k, precision_at_k};