pub struct EvalRunner { /* private fields */ }
Orchestrates evaluation: runs agents, captures trajectories, and scores
results. Defaults: sequential execution, num_runs=1, no cache, no cancellation.
Implementations
impl EvalRunner
pub fn new(registry: EvaluatorRegistry) -> Self
Create a runner with a custom evaluator registry.
pub fn with_defaults() -> Self
Create a runner pre-loaded with built-in evaluators.
pub fn with_parallelism(self, n: usize) -> Self
pub fn with_num_runs(self, n: u32) -> Self
pub fn with_cache(self, store: Arc<dyn EvaluationDataStore>) -> Self
Attach a pluggable EvaluationDataStore for cached invocations
(FR-038).
pub fn with_cancellation(self, token: CancellationToken) -> Self
Attach a CancellationToken honored at every await point (FR-040).
pub fn with_initial_session_file(self, path: PathBuf) -> Self
Load the given JSON file as an initial SessionState before each
case (FR-039 / R-023). Missing / malformed files surface as
EvalError::InvalidCase — never a panic.
pub fn with_telemetry(self, telemetry: Arc<EvalsTelemetry>) -> Self
Attach an EvalsTelemetry (spec 043 US7 / FR-035). When present,
Self::run_set emits the three-level span tree
swink.eval.run_set → swink.eval.case → swink.eval.evaluator.
pub fn agent_invocation_count(&self) -> usize
Number of times an agent was actually invoked (cache miss count).
pub fn reset_agent_invocation_count(&self)
Reset the agent-invocation counter to zero.
pub async fn run_case(
    &self,
    case: &EvalCase,
    factory: &dyn AgentFactory,
) -> Result<EvalCaseResult, EvalError>
Run a single eval case and return the scored result.
pub async fn run_set(
    &self,
    eval_set: &EvalSet,
    factory: &dyn AgentFactory,
) -> Result<EvalSetResult, EvalError>
Run an entire eval set and return aggregated results.