use std::sync::Arc;
use std::time::Duration;
use serde::{Deserialize, Serialize};
use swink_agent::{AssistantMessage, Cost, ModelSpec, StopReason, ToolResultMessage, Usage};
use crate::score::{Score, Verdict};
/// A tool call stored as plain data: an identifier, the tool's name, and its
/// JSON arguments.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecordedToolCall {
/// Identifier of the call.
/// NOTE(review): presumably the id assigned to the tool call by the model
/// provider — confirm against the code that builds these records.
pub id: String,
/// Name of the tool that was called.
pub name: String,
/// Arguments of the call, kept as an arbitrary JSON value.
pub arguments: serde_json::Value,
}
/// Record of a single turn: the assistant message together with the tool
/// calls and tool results stored alongside it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TurnRecord {
/// Position of this turn within its invocation.
/// NOTE(review): whether counting starts at 0 or 1 is not established by
/// this type — confirm against the recorder.
pub turn_index: usize,
/// The assistant message for this turn.
pub assistant_message: AssistantMessage,
/// Tool calls recorded for this turn.
pub tool_calls: Vec<RecordedToolCall>,
/// Tool result messages recorded for this turn.
pub tool_results: Vec<ToolResultMessage>,
/// Duration associated with this turn; what exactly is timed is decided by
/// the code that fills this field in.
pub duration: Duration,
}
/// Aggregate record of one run: the per-turn records plus totals, the final
/// response, the stop reason, and the model used.
///
/// Stored per case in [`EvalCaseResult::invocation`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Invocation {
/// Per-turn records.
pub turns: Vec<TurnRecord>,
/// Total usage for the invocation.
pub total_usage: Usage,
/// Total cost for the invocation.
pub total_cost: Cost,
/// Total duration for the invocation.
pub total_duration: Duration,
/// Final response text, if there is one. Unlike most optional fields in
/// this file it carries no `skip_serializing_if`, so `None` is still
/// written out by the serializer.
pub final_response: Option<String>,
/// Reason the run stopped.
pub stop_reason: StopReason,
/// Model specification recorded for the run.
pub model: ModelSpec,
}
/// One expected tool call, as listed in [`EvalCase::expected_trajectory`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExpectedToolCall {
/// Name of the tool expected to be called.
pub tool_name: String,
/// Expected arguments, if any. Omitted from serialized output when `None`;
/// a missing field deserializes to `None`.
/// NOTE(review): presumably `None` means "do not check arguments" — confirm
/// against the trajectory-matching code.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub arguments: Option<serde_json::Value>,
}
/// Criteria describing an expected response.
///
/// Serialized as an internally tagged enum: the variant is selected by a
/// `mode` field whose value is the snake_case variant name (`exact`,
/// `contains`, `regex`). `Debug` is implemented by hand because the closure
/// held by `Custom` has no `Debug` impl.
///
/// NOTE(review): the matching itself is not implemented in this type; the
/// per-variant notes describe only the data each variant carries.
#[derive(Clone, Serialize, Deserialize)]
#[serde(tag = "mode", rename_all = "snake_case")]
pub enum ResponseCriteria {
/// Carries the full expected text.
Exact { expected: String },
/// Carries a substring.
Contains { substring: String },
/// Carries a regular-expression pattern as a string; it is not compiled
/// or validated here.
Regex { pattern: String },
/// Carries a shared, thread-safe closure mapping a `&str` to a [`Score`].
/// Marked `#[serde(skip)]`, so serde never serializes or deserializes this
/// variant; it can only be constructed in code.
#[serde(skip)]
Custom(#[serde(skip)] Arc<dyn Fn(&str) -> Score + Send + Sync>),
}
/// Hand-written `Debug`: a derive is not possible because the closure held by
/// `Custom` does not implement `Debug`.
impl std::fmt::Debug for ResponseCriteria {
    /// Renders the string-carrying variants as single-field structs and
    /// `Custom` as a one-element tuple containing the placeholder `"<fn>"`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Each data-carrying variant has exactly one named string field, so
        // pick out (variant name, field name, value) and format them in one place.
        let (variant, field, value) = match self {
            // The closure cannot be printed; emit a fixed placeholder instead.
            Self::Custom(_) => return f.debug_tuple("Custom").field(&"<fn>").finish(),
            Self::Exact { expected } => ("Exact", "expected", expected),
            Self::Contains { substring } => ("Contains", "substring", substring),
            Self::Regex { pattern } => ("Regex", "pattern", pattern),
        };
        f.debug_struct(variant).field(field, value).finish()
    }
}
/// Optional upper limits attached to an [`EvalCase`].
///
/// Every field is optional: `None` fields are omitted from serialized output,
/// and fields missing from the input deserialize to `None`.
/// NOTE(review): this type only stores the limits; how (and whether) each is
/// enforced is decided elsewhere.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BudgetConstraints {
/// Maximum cost. The unit/currency is not specified by this type.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub max_cost: Option<f64>,
/// Maximum number of tokens.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub max_tokens: Option<u64>,
/// Maximum number of turns.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub max_turns: Option<usize>,
/// Maximum duration. Uses serde's built-in `Duration` representation
/// (a struct with `secs` and `nanos` fields).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub max_duration: Option<Duration>,
}
/// A single evaluation case: the prompts to run plus optional expectations
/// and limits.
///
/// `id`, `name`, `system_prompt` and `user_messages` are required when
/// deserializing; every other field falls back to its default when absent and
/// is omitted from serialized output when empty (`None`, empty list, or JSON
/// `null`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalCase {
/// Identifier of the case; [`EvalCaseResult::case_id`] has the same type.
/// NOTE(review): uniqueness within a set is not enforced by this type.
pub id: String,
/// Name of the case.
pub name: String,
/// Optional free-form description.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
/// System prompt for the case.
pub system_prompt: String,
/// User messages for the case, in order.
pub user_messages: Vec<String>,
/// Optional list of expected tool calls.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub expected_trajectory: Option<Vec<ExpectedToolCall>>,
/// Optional criteria for the response. A `ResponseCriteria::Custom` value
/// is skipped by serde and therefore cannot come from a serialized file.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub expected_response: Option<ResponseCriteria>,
/// Optional budget limits.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub budget: Option<BudgetConstraints>,
/// Evaluator names. Defaults to an empty list and is omitted when empty.
/// NOTE(review): the meaning of an empty list (e.g. "run all evaluators")
/// is decided by the runner — confirm there.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub evaluators: Vec<String>,
/// Arbitrary JSON metadata. Defaults to JSON `null` and is omitted when
/// `null`.
#[serde(default, skip_serializing_if = "serde_json::Value::is_null")]
pub metadata: serde_json::Value,
}
/// A named collection of [`EvalCase`]s.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalSet {
/// Identifier of the set; [`EvalSetResult::eval_set_id`] has the same type.
pub id: String,
/// Name of the set.
pub name: String,
/// Optional free-form description; omitted from serialized output when
/// `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
/// The cases in this set.
pub cases: Vec<EvalCase>,
}
/// Result reported for one evaluator: its name, a [`Score`], and optional
/// details.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalMetricResult {
/// Name of the evaluator this result belongs to.
pub evaluator_name: String,
/// The score.
pub score: Score,
/// Optional free-form details; omitted from serialized output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub details: Option<String>,
}
/// Result for a single case: the recorded invocation, the per-evaluator
/// metric results, and the verdict.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalCaseResult {
/// Identifier of the case this result is for.
pub case_id: String,
/// The recorded invocation.
pub invocation: Invocation,
/// Metric results, one entry per reported evaluator result.
pub metric_results: Vec<EvalMetricResult>,
/// Verdict for the case.
/// NOTE(review): how the verdict is derived from `metric_results` is not
/// defined by this type.
pub verdict: Verdict,
}
/// Result for an evaluation set: the per-case results, a summary, and a
/// timestamp.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalSetResult {
/// Identifier of the evaluation set this result is for.
pub eval_set_id: String,
/// Results of the individual cases.
pub case_results: Vec<EvalCaseResult>,
/// Summary counts and totals.
pub summary: EvalSummary,
/// Timestamp stored as an unsigned integer.
/// NOTE(review): the epoch and unit (seconds vs. milliseconds) are not
/// specified by this type — confirm against the code that sets it.
pub timestamp: u64,
}
/// Summary counts and totals, stored in [`EvalSetResult::summary`].
///
/// NOTE(review): this type does not enforce any relationship between the
/// counts (e.g. `passed + failed == total_cases`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalSummary {
/// Total number of cases.
pub total_cases: usize,
/// Number of cases counted as passed.
pub passed: usize,
/// Number of cases counted as failed.
pub failed: usize,
/// Total cost.
pub total_cost: Cost,
/// Total usage.
pub total_usage: Usage,
/// Total duration.
pub total_duration: Duration,
}