Skip to main content

swink_agent_eval/
lib.rs

1// Crate-wide unsafe policy: deny by default. Every new surface added by spec
2// 043 MUST compile under this ceiling. Per FR-049, the single authorized
3// carve-out is `evaluators::code::sandbox::posix` (Unix-only, rlimit FFI),
4// which relaxes to `#![allow(unsafe_code)]` at module scope with per-block
5// `// SAFETY:` annotations. `deny` is used instead of `forbid` because
6// `forbid` cannot be relaxed by a nested `allow`. Absent any other `allow`,
7// the effect is the same: unsafe outside the carve-out is a hard compile error.
8#![deny(unsafe_code)]
9//! Evaluation framework for swink-agent.
10//!
11//! Provides trajectory tracing, golden path verification, response matching,
12//! and cost/latency governance for LLM-powered agent loops.
13//!
14//! # Quick Start
15//!
16//! ```rust,ignore
17//! use swink_agent_eval::{EvalRunner, EvalSet, EvalCase};
18//!
19//! let set = EvalSet { id: "demo".into(), name: "Demo".into(), description: None, cases: vec![] };
20//! let runner = EvalRunner::with_defaults();
21//! let result = runner.run_set(&set, &my_factory).await?;
22//! println!("Passed: {}/{}", result.summary.passed, result.summary.total_cases);
23//! ```
24
// Module tree of the crate root.
//
// Modules declared `pub` are part of the public path hierarchy. Private modules
// are not nameable from outside the crate; their items reach users only through
// the `pub use` re-exports further down in this file.
//
// A `#[cfg(feature = ...)]` attribute applies to the single declaration that
// follows it: that module is compiled only when the named Cargo feature is on.
25pub mod aggregator;
26mod audit;
27mod budget;
28pub mod cache;
29#[cfg(feature = "cli")]
30pub mod ci;
31mod efficiency;
32mod environment_state;
33mod error;
34mod evaluator;
// NOTE(review): `evaluators` exists only under `judge-core`, yet the re-exports
// from `evaluators::*` in this file are gated on other features
// (`evaluator-agent`, `evaluator-code`, `evaluator-sandbox`, `multimodal`,
// `evaluator-quality`, `evaluator-rag`, `evaluator-safety`, `evaluator-simple`,
// `evaluator-structured`). Presumably each of those implies `judge-core` in
// Cargo.toml; otherwise enabling one alone is an unresolved-import error — confirm.
35#[cfg(feature = "judge-core")]
36pub mod evaluators;
37mod gate;
38#[cfg(feature = "generation")]
39pub mod generation;
40pub mod judge;
41mod match_;
42#[cfg(feature = "judge-core")]
43pub mod prompt;
44pub mod report;
45mod response;
46mod runner;
47mod score;
48mod semantic_tool_parameter;
49mod semantic_tool_selection;
50#[cfg(feature = "simulation")]
51pub mod simulation;
52mod store;
53#[cfg(feature = "telemetry")]
54pub mod telemetry;
55pub mod testing;
// NOTE(review): the `trace::*` re-exports in this file are gated on
// `trace-langfuse` / `trace-otlp`, not on `trace-ingest`. Presumably both imply
// `trace-ingest` in Cargo.toml — confirm.
56#[cfg(feature = "trace-ingest")]
57pub mod trace;
58#[cfg(feature = "training-export")]
59pub mod training;
60mod trajectory;
61mod types;
62mod url_filter;
63#[cfg(feature = "yaml")]
64mod yaml;
65
66// ─── Public API ─────────────────────────────────────────────────────────────
67
68pub use aggregator::{Aggregator, AllPass, AnyPass, Average, Weighted};
69pub use audit::AuditedInvocation;
70pub use budget::BudgetEvaluator;
71pub use cache::{
72    CacheKey as TaskResultCacheKey, EvaluationDataStore, FingerprintContext,
73    LocalFileTaskResultStore, StoreError, canonicalize_fingerprint, tool_set_hash,
74};
75pub use efficiency::EfficiencyEvaluator;
76pub use environment_state::EnvironmentStateEvaluator;
77pub use error::EvalError;
78pub use evaluator::{Evaluator, EvaluatorRegistry};
79#[cfg(feature = "evaluator-agent")]
80pub use evaluators::agent::{
81    AgentToneEvaluator, InteractionsEvaluator, KnowledgeRetentionEvaluator,
82    LanguageDetectionEvaluator, PerceivedErrorEvaluator, TaskCompletionEvaluator,
83    TrajectoryAccuracyEvaluator, TrajectoryAccuracyWithRefEvaluator, UserSatisfactionEvaluator,
84};
85#[cfg(feature = "evaluator-code")]
86pub use evaluators::code::llm_judge::CodeLlmJudgeEvaluator;
87#[cfg(feature = "evaluator-code")]
88pub use evaluators::code::{
89    CargoCheckEvaluator, ClippyEvaluator, CodeExtractor, CodeExtractorStrategy,
90};
91#[cfg(feature = "evaluator-sandbox")]
92pub use evaluators::code::{
93    SandboxLimits, SandboxOutcome, SandboxRunner, SandboxedExecutionEvaluator, ShellRunner,
94    run_sandboxed,
95};
96#[cfg(feature = "multimodal")]
97pub use evaluators::multimodal::ImageSafetyEvaluator;
98#[cfg(feature = "evaluator-quality")]
99pub use evaluators::quality::{
100    CoherenceEvaluator, ConcisenessEvaluator, CorrectnessEvaluator, FaithfulnessEvaluator,
101    GoalSuccessRateEvaluator, HallucinationEvaluator, HelpfulnessEvaluator, LazinessEvaluator,
102    PlanAdherenceEvaluator, ResponseRelevanceEvaluator, assertion_implies_goal_completion,
103};
104#[cfg(feature = "evaluator-rag")]
105pub use evaluators::rag::{
106    DEFAULT_EMBEDDING_SIMILARITY_THRESHOLD, Embedder, EmbedderError, EmbeddingSimilarityEvaluator,
107    RAGGroundednessEvaluator, RAGHelpfulnessEvaluator, RAGRetrievalRelevanceEvaluator,
108};
109#[cfg(feature = "evaluator-safety")]
110pub use evaluators::safety::{
111    CodeInjectionEvaluator, FairnessEvaluator, HarmfulnessEvaluator, PIIClass, PIILeakageEvaluator,
112    PromptInjectionEvaluator, ToxicityEvaluator,
113};
114#[cfg(feature = "evaluator-simple")]
115pub use evaluators::simple::{ExactMatchEvaluator, LevenshteinDistanceEvaluator};
116#[cfg(feature = "evaluator-structured")]
117pub use evaluators::structured::{JsonMatchEvaluator, JsonSchemaEvaluator, KeyStrategy};
118#[cfg(feature = "judge-core")]
119pub use evaluators::{
120    Detail, DetailBuffer, DispatchError, DispatchOutcome, EvaluatorError, JudgeEvaluatorBuilder,
121    JudgeEvaluatorConfig, dispatch_judge, drive_judge_call, evaluate_with_builtin,
122    finish_metric_result, materialize_case_attachments,
123};
124pub use gate::{GateConfig, GateResult, check_gate};
125pub use judge::{
126    CacheKey, DEFAULT_JUDGE_CACHE_CAPACITY, JudgeCache, JudgeClient, JudgeError, JudgeFuture,
127    JudgeRegistry, JudgeRegistryBuilder, JudgeRegistryError, JudgeVerdict, RetryPolicy,
128};
129pub use match_::{MatchMode, TrajectoryMatcher};
130#[cfg(feature = "judge-core")]
131pub use prompt::{
132    BUILTIN_TEMPLATE_VERSIONS, JudgePromptTemplate, MinijinjaTemplate, PromptContext, PromptError,
133    PromptFamily, PromptTemplateRegistry,
134};
135#[cfg(feature = "html-report")]
136pub use report::HtmlReporter;
137pub use report::{
138    ConsoleReporter, JsonReporter, MarkdownReporter, Reporter, ReporterError, ReporterOutput,
139};
140#[cfg(feature = "langsmith")]
141pub use report::{LangSmithExportError, LangSmithExporter};
142pub use response::ResponseMatcher;
143pub use runner::{AgentFactory, EvalRunner, RunnerMetricSample};
144pub use score::{Score, Verdict};
145pub use semantic_tool_parameter::SemanticToolParameterEvaluator;
146pub use semantic_tool_selection::SemanticToolSelectionEvaluator;
147pub use store::{EvalStore, FsEvalStore};
148#[cfg(feature = "telemetry")]
149pub use telemetry::{EvalsTelemetry, EvalsTelemetryBuilder};
150pub use testing::{MockJudge, PanickingMockJudge, SlowMockJudge};
151#[cfg(feature = "trace-langfuse")]
152pub use trace::LangfuseTraceProvider;
153#[cfg(feature = "trace-otlp")]
154pub use trace::OtlpHttpTraceProvider;
155#[cfg(feature = "training-export")]
156pub use training::{
157    ChatMlExporter, DpoExporter, ExportError, ExportOptions, ScoredTrace, ShareGptExporter,
158    TrainingExporter, TrainingFormat, TrainingReporter, export_traces,
159};
160pub use trajectory::TrajectoryCollector;
161pub use types::{
162    Assertion, AssertionKind, Attachment, AttachmentError, BudgetConstraints, CASE_NAMESPACE,
163    CaseFingerprint, EnvironmentState, EvalCase, EvalCaseResult, EvalMetricResult, EvalSet,
164    EvalSetResult, EvalSummary, ExpectedToolCall, FewShotExample, InteractionExpectation,
165    Invocation, MaterializedAttachment, RecordedToolCall, ResponseCriteria, StateCapture,
166    ToolIntent, TurnRecord, validate_eval_case, validate_eval_set,
167};
168pub use url_filter::{DefaultUrlFilter, UrlFilter};
169#[cfg(feature = "yaml")]
170pub use yaml::load_eval_set_yaml;