Skip to main content

ai_agents_eval/
evidence.rs

1use std::collections::HashMap;
2
3use ai_agents_observability::ObservabilityReport;
4use ai_agents_runtime::RuntimeAgent;
5use chrono::{DateTime, Utc};
6use serde::{Deserialize, Serialize};
7use serde_json::Value;
8
9use crate::fixtures::RecordingToolLog;
10
11/// Source category for a recorded tool execution.
12#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
13#[serde(rename_all = "snake_case")]
14pub enum ToolExecutionSource {
15    Llm,
16    Skill,
17    StateAction,
18    OnEnter,
19    OnExit,
20    PostTransition,
21    Spawner,
22    Orchestration,
23    Mock,
24}
25
26/// Structured record for one tool execution observed during eval.
27#[derive(Debug, Clone, Serialize, Deserialize)]
28pub struct ToolExecutionRecord {
29    /// Unique ID for this recorded tool call.
30    pub call_id: String,
31    /// Canonical tool ID executed by the registry.
32    pub tool_id: String,
33    /// Tool name requested by the model or runtime.
34    pub requested_name: String,
35    /// Source category assigned to this execution.
36    pub source: ToolExecutionSource,
37    /// Current or expected state name.
38    #[serde(default, skip_serializing_if = "Option::is_none")]
39    pub state: Option<String>,
40    /// Actor ID associated with this evidence.
41    #[serde(default, skip_serializing_if = "Option::is_none")]
42    pub actor_id: Option<String>,
43    /// Original tool arguments before execution.
44    pub arguments_original: Value,
45    /// Arguments passed to the wrapped tool.
46    pub arguments_executed: Value,
47    /// Whether the operation succeeded.
48    pub success: bool,
49    /// Directory where output artifacts are written.
50    #[serde(default, skip_serializing_if = "Option::is_none")]
51    pub output: Option<Value>,
52    /// Error text for failed execution.
53    #[serde(default, skip_serializing_if = "Option::is_none")]
54    pub error: Option<String>,
55    /// Optional response or tool metadata.
56    #[serde(default, skip_serializing_if = "Option::is_none")]
57    pub metadata: Option<Value>,
58    /// UTC timestamp when execution started.
59    pub started_at: DateTime<Utc>,
60    /// Duration in milliseconds.
61    pub duration_ms: u64,
62    /// Optional observability span ID.
63    #[serde(default, skip_serializing_if = "Option::is_none")]
64    pub observability_span_id: Option<String>,
65}
66
67/// Skill routing evidence inferred or reported for a turn.
68#[derive(Debug, Clone, Serialize, Deserialize)]
69pub struct SkillEvidence {
70    /// Skill ID selected by routing, if available.
71    pub selected_skill_id: Option<String>,
72    /// Skill ID actually executed, if available.
73    pub executed_skill_id: Option<String>,
74    /// Whether routing found no matching skill.
75    pub no_match: bool,
76    /// Whether clarification was requested.
77    pub clarification_requested: bool,
78}
79
80/// Normalized status values for disambiguation evidence.
81#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
82#[serde(rename_all = "snake_case")]
83pub enum DisambiguationStatus {
84    Clear,
85    Skipped,
86    Triggered,
87    Clarified,
88    BestGuess,
89    Abandoned,
90    GiveUp,
91    Escalated,
92}
93
94/// Disambiguation evidence inferred or reported for a turn.
95#[derive(Debug, Clone, Serialize, Deserialize)]
96pub struct DisambiguationEvidence {
97    /// Final or normalized status value.
98    pub status: DisambiguationStatus,
99    /// Ambiguity type reported by detection, if available.
100    #[serde(default, skip_serializing_if = "Option::is_none")]
101    pub ambiguity_type: Option<String>,
102    /// Detection confidence reported by the system.
103    #[serde(default, skip_serializing_if = "Option::is_none")]
104    pub confidence: Option<f32>,
105    /// Resolved payload when available.
106    #[serde(default, skip_serializing_if = "Option::is_none")]
107    pub resolved: Option<Value>,
108}
109
110/// Actor fact evidence captured around one turn.
111#[derive(Debug, Clone, Serialize, Deserialize)]
112pub struct FactsEvidence {
113    /// Actor ID associated with this evidence.
114    pub actor_id: Option<String>,
115    /// Serialized fact records visible after the turn.
116    pub facts: Vec<Value>,
117    /// Number of facts before the turn when known.
118    pub before_count: Option<usize>,
119    /// Number of facts after the turn when known.
120    pub after_count: Option<usize>,
121}
122
123/// Relationship memory evidence captured around one turn.
124#[derive(Debug, Clone, Serialize, Deserialize)]
125pub struct RelationshipEvidence {
126    /// Actor ID associated with this evidence.
127    pub actor_id: Option<String>,
128    /// Model or relationship model name.
129    pub model: Option<String>,
130    /// Perspectives available for assertions.
131    pub available_perspectives: Vec<String>,
132    /// Current serialized relationship state.
133    pub current: Option<Value>,
134    /// State before the turn when available.
135    pub before: Option<Value>,
136    /// State after the turn when available.
137    pub after: Option<Value>,
138}
139
140/// Persona reveal and evolution evidence for one turn.
141#[derive(Debug, Clone, Serialize, Deserialize)]
142pub struct PersonaEvidence {
143    /// Whether any persona secret is currently revealed.
144    pub secret_revealed: bool,
145    /// IDs of revealed secrets when stable IDs are available.
146    pub revealed_secret_ids: Vec<String>,
147    /// Number of revealed secrets.
148    pub revealed_secret_count: usize,
149    /// Number of persona evolution events recorded.
150    pub evolution_events: usize,
151}
152
153/// Observability evidence attached to one evaluated turn.
154#[derive(Debug, Clone, Serialize, Deserialize)]
155pub struct TurnObservabilityEvidence {
156    /// Trace ID associated with the turn when available.
157    pub trace_id: Option<String>,
158    /// Span IDs observed during the turn.
159    pub span_ids: Vec<String>,
160    /// Observability report snapshot generated after the turn.
161    pub report: Option<ObservabilityReport>,
162}
163
164/// Full assertion-time evidence collected after a turn.
165#[derive(Debug, Clone, Serialize, Deserialize)]
166pub struct TurnEvidence {
167    /// Response metadata produced by the runtime.
168    pub response_metadata: Option<Value>,
169    /// Current or expected state name.
170    pub state: Option<String>,
171    /// State transition history observed by the runtime.
172    pub state_history: Vec<ai_agents_core::StateTransitionEvent>,
173    /// Runtime or fixture context value.
174    pub context: Value,
175    /// Tool calls recorded during this turn.
176    pub tool_executions: Vec<ToolExecutionRecord>,
177    /// Skill evidence for this turn, if available.
178    pub skill: Option<SkillEvidence>,
179    /// Expected disambiguation status or evidence.
180    pub disambiguation: Option<DisambiguationEvidence>,
181    /// Serialized fact records visible after the turn.
182    pub facts: Option<FactsEvidence>,
183    /// Relationship memory assertion or evidence.
184    pub relationship: Option<RelationshipEvidence>,
185    /// persona value for TurnEvidence.
186    pub persona: Option<PersonaEvidence>,
187    /// Orchestration metadata assertion or evidence.
188    pub orchestration: Option<Value>,
189    /// Observability assertion, setting, or report value.
190    pub observability: Option<TurnObservabilityEvidence>,
191}
192
193pub fn collect_turn_evidence(
194    agent: &RuntimeAgent,
195    response_metadata: Option<HashMap<String, Value>>,
196    tool_log: &RecordingToolLog,
197    tool_start_index: usize,
198    before_relationship: Option<Value>,
199) -> TurnEvidence {
200    let context_map = agent.get_context();
201    let context = serde_json::to_value(&context_map).unwrap_or(Value::Null);
202    let metadata_value = response_metadata
203        .clone()
204        .and_then(|metadata| serde_json::to_value(metadata).ok());
205    let orchestration = metadata_value
206        .as_ref()
207        .and_then(|metadata| metadata.get("orchestration").cloned())
208        .or_else(|| context.get("orchestration").cloned());
209    let disambiguation = infer_disambiguation(metadata_value.as_ref(), &context);
210    let skill = infer_skill(metadata_value.as_ref(), disambiguation.as_ref());
211    let actor_id = agent.actor_id();
212    let facts = Some(FactsEvidence {
213        actor_id: actor_id.clone(),
214        facts: agent
215            .actor_facts()
216            .into_iter()
217            .filter_map(|fact| serde_json::to_value(fact).ok())
218            .collect(),
219        before_count: None,
220        after_count: Some(agent.actor_facts().len()),
221    });
222    let relationship = collect_relationship(agent, actor_id.clone(), before_relationship);
223    let persona = collect_persona(agent, &context_map);
224    let observability = agent.observability().map(|manager| {
225        let report = manager.generate_report();
226        let raw_events = manager.raw_events();
227        TurnObservabilityEvidence {
228            trace_id: raw_events.last().map(|event| event.trace_id.clone()),
229            span_ids: raw_events
230                .iter()
231                .map(|event| event.span_id.clone())
232                .collect(),
233            report: Some(report),
234        }
235    });
236
237    TurnEvidence {
238        response_metadata: metadata_value,
239        state: agent.current_state(),
240        state_history: agent.state_history(),
241        context,
242        tool_executions: tool_log.records_since(tool_start_index),
243        skill,
244        disambiguation,
245        facts,
246        relationship,
247        persona,
248        orchestration,
249        observability,
250    }
251}
252
253pub fn relationship_snapshot(agent: &RuntimeAgent) -> Option<Value> {
254    let actor_id = agent.actor_id()?;
255    let manager = agent.relationship_manager()?;
256    manager.relationship_as_value(&actor_id).ok().flatten()
257}
258
259fn infer_disambiguation(
260    metadata: Option<&Value>,
261    context: &Value,
262) -> Option<DisambiguationEvidence> {
263    if let Some(disambiguation) = metadata.and_then(|m| m.get("disambiguation")) {
264        let status = match disambiguation
265            .get("status")
266            .and_then(Value::as_str)
267            .unwrap_or("triggered")
268        {
269            "awaiting_clarification" => DisambiguationStatus::Triggered,
270            "clarified" => DisambiguationStatus::Clarified,
271            "best_guess" => DisambiguationStatus::BestGuess,
272            "abandoned" => DisambiguationStatus::Abandoned,
273            "give_up" => DisambiguationStatus::GiveUp,
274            "escalated" => DisambiguationStatus::Escalated,
275            "skipped" => DisambiguationStatus::Skipped,
276            "clear" => DisambiguationStatus::Clear,
277            _ => DisambiguationStatus::Triggered,
278        };
279        let detection = disambiguation.get("detection");
280        return Some(DisambiguationEvidence {
281            status,
282            ambiguity_type: detection.and_then(|d| d.get("type")).map(|v| v.to_string()),
283            confidence: detection
284                .and_then(|d| d.get("confidence"))
285                .and_then(Value::as_f64)
286                .map(|v| v as f32),
287            resolved: disambiguation.get("resolved").cloned(),
288        });
289    }
290
291    if context
292        .pointer("/disambiguation/resolved")
293        .and_then(Value::as_bool)
294        .unwrap_or(false)
295    {
296        return Some(DisambiguationEvidence {
297            status: DisambiguationStatus::Clarified,
298            ambiguity_type: None,
299            confidence: None,
300            resolved: context.get("disambiguation").cloned(),
301        });
302    }
303
304    None
305}
306
307fn infer_skill(
308    metadata: Option<&Value>,
309    disambiguation: Option<&DisambiguationEvidence>,
310) -> Option<SkillEvidence> {
311    let skill_id = metadata
312        .and_then(|m| m.get("skill_id"))
313        .and_then(Value::as_str)
314        .map(str::to_string)
315        .or_else(|| {
316            metadata
317                .and_then(|m| m.get("disambiguation"))
318                .and_then(|d| d.get("skill_id"))
319                .and_then(Value::as_str)
320                .map(str::to_string)
321        });
322
323    if skill_id.is_none() && disambiguation.is_none() {
324        return None;
325    }
326
327    Some(SkillEvidence {
328        selected_skill_id: skill_id.clone(),
329        executed_skill_id: skill_id,
330        no_match: false,
331        clarification_requested: disambiguation
332            .map(|d| d.status == DisambiguationStatus::Triggered)
333            .unwrap_or(false),
334    })
335}
336
337fn collect_relationship(
338    agent: &RuntimeAgent,
339    actor_id: Option<String>,
340    before: Option<Value>,
341) -> Option<RelationshipEvidence> {
342    let actor_id = actor_id?;
343    let manager = agent.relationship_manager()?;
344    let current = manager.relationship_as_value(&actor_id).ok().flatten();
345    let model = current
346        .as_ref()
347        .and_then(|value| value.get("model"))
348        .and_then(Value::as_str)
349        .map(str::to_string);
350    let mut available = vec!["agent_to_actor".to_string(), "mutual".to_string()];
351    if model.as_deref() == Some("two_sided") {
352        available.push("perceived_actor_to_agent".to_string());
353    }
354    Some(RelationshipEvidence {
355        actor_id: Some(actor_id),
356        model,
357        available_perspectives: available,
358        before,
359        after: current.clone(),
360        current,
361    })
362}
363
364fn collect_persona(
365    agent: &RuntimeAgent,
366    context_map: &HashMap<String, Value>,
367) -> Option<PersonaEvidence> {
368    let manager = agent.persona_manager()?;
369    let revealed_count = manager.revealed_secrets(context_map).len();
370    Some(PersonaEvidence {
371        secret_revealed: revealed_count > 0,
372        revealed_secret_ids: Vec::new(),
373        revealed_secret_count: revealed_count,
374        evolution_events: manager.history().len(),
375    })
376}