ai_agents_eval/
assertion.rs

1use serde::{Deserialize, Serialize};
2use serde_json::{Value, json};
3use std::collections::HashMap;
4
5use crate::evidence::{DisambiguationStatus, ToolExecutionRecord, TurnEvidence};
6use crate::judge::{JudgeAssertion, JudgeInput, JudgeResolver};
7
/// Collection of assertion clauses evaluated against one turn.
///
/// Every field is optional; an unset clause is skipped. A node passes only
/// when every clause that is set on it passes.
#[derive(Debug, Clone, Deserialize, Serialize, Default)]
pub struct Assertion {
    /// State name the turn's recorded state must equal exactly.
    #[serde(default)]
    pub state: Option<String>,
    /// Allowed state names; fails when no state was recorded.
    #[serde(default)]
    pub state_in: Option<Vec<String>>,
    /// State name that must not be current; passes when no state was recorded.
    #[serde(default)]
    pub state_not: Option<String>,
    /// State name that must appear as the `from` or `to` side of a recorded
    /// transition.
    #[serde(default)]
    pub state_history_contains: Option<String>,
    /// Substring or substrings that must all be present in the response
    /// (one result detail is produced per substring).
    #[serde(default)]
    pub response_contains: Option<StringList>,
    /// Response passes when any listed substring is present.
    #[serde(default)]
    pub response_contains_any: Option<StringList>,
    /// Substring or substrings that must all be absent from the response.
    #[serde(default)]
    pub response_not_contains: Option<StringList>,
    /// `true` requires non-whitespace response text; `false` requires the
    /// response to be empty or whitespace-only.
    #[serde(default)]
    pub response_not_empty: Option<bool>,
    /// Semantic response-quality assertion evaluated by the judge LLM and
    /// reported under the `judge` detail name.
    #[serde(default)]
    pub response_semantic: Option<JudgeAssertion>,
    /// Expected disambiguation status; fails when the turn carries no
    /// disambiguation evidence.
    #[serde(default)]
    pub disambiguation: Option<DisambiguationExpectation>,
    /// Whether no active disambiguation (triggered, clarified, or best-guess
    /// status) should have occurred.
    #[serde(default)]
    pub no_disambiguation: Option<bool>,
    /// Tool call assertion in string or object form.
    #[serde(default)]
    pub tool_called: Option<ToolCalledAssertion>,
    /// Tool ID or requested tool name that must not appear in tool evidence.
    #[serde(default)]
    pub tool_not_called: Option<String>,
    /// Skill ID expected as either the selected or the executed skill.
    #[serde(default)]
    pub skill_triggered: Option<String>,
    /// Top-level response metadata keys and the exact values they must hold.
    #[serde(default)]
    pub metadata_contains: Option<HashMap<String, Value>>,
    /// Dot-path assertion over response metadata.
    #[serde(default)]
    pub metadata_path: Option<PathAssertion>,
    /// Dot-path assertion over runtime context.
    #[serde(default)]
    pub context_path: Option<PathAssertion>,
    /// Fact assertion for actor memory evidence.
    #[serde(default)]
    pub facts_include: Option<FactsAssertion>,
    /// Assertion over relationship memory evidence.
    #[serde(default)]
    pub relationship: Option<RelationshipAssertion>,
    /// Persona secret reveal assertion.
    #[serde(default)]
    pub persona_secret_revealed: Option<SecretAssertion>,
    /// Assertion over orchestration metadata.
    #[serde(default)]
    pub orchestration: Option<OrchestrationAssertion>,
    /// Assertion over the observability report.
    #[serde(default)]
    pub observability: Option<ObservabilityAssertion>,
    /// LLM judge assertion; fails when no judge resolver is available.
    #[serde(default)]
    pub judge: Option<JudgeAssertion>,
    /// Child assertions where at least one must pass.
    #[serde(default)]
    pub any: Option<Vec<Assertion>>,
    /// Child assertions where every child must pass.
    #[serde(default)]
    pub all: Option<Vec<Assertion>>,
    /// Child assertion that must fail.
    #[serde(default)]
    pub not: Option<Box<Assertion>>,
}
90
/// YAML helper accepting either one string or a list of strings.
///
/// Deserialized untagged: a scalar string becomes `One`, a sequence becomes
/// `Many`.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub enum StringList {
    /// A single string value.
    One(String),
    /// Several string values.
    Many(Vec<String>),
}
98
99impl StringList {
100    fn items(&self) -> Vec<String> {
101        match self {
102            Self::One(value) => vec![value.clone()],
103            Self::Many(values) => values.clone(),
104        }
105    }
106}
107
/// String or object form for tool call assertions.
///
/// Deserialized untagged: a plain string is treated as a tool identifier, a
/// mapping as the detailed object form.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub enum ToolCalledAssertion {
    /// Tool ID or requested tool name that must have been called.
    Id(String),
    /// Detailed expectations about matching tool executions.
    Object(ToolCalledObject),
}
115
/// Object form for checking tool execution evidence.
///
/// `id`, `success`, and `source_in` select the matching records; the count
/// and path fields are then checked against that selection.
#[derive(Debug, Clone, Deserialize, Serialize, Default)]
pub struct ToolCalledObject {
    /// Tool ID or requested tool name a record must carry to match; when
    /// unset, records of any tool are considered.
    #[serde(default)]
    pub id: Option<String>,
    /// Exact number of matching records required.
    #[serde(default)]
    pub count: Option<usize>,
    /// Minimum number of matching records required.
    #[serde(default)]
    pub count_gte: Option<usize>,
    /// Keeps only records whose success flag equals this value.
    #[serde(default)]
    pub success: Option<bool>,
    /// Allowed source labels for matching records, compared against the
    /// serialized form of each record's source.
    #[serde(default)]
    pub source_in: Option<Vec<String>>,
    /// Alias for `args_executed`: path assertion over executed tool arguments.
    #[serde(default)]
    pub args: Option<PathAssertion>,
    /// Path assertion over original tool arguments; at least one matching
    /// record must satisfy it.
    #[serde(default)]
    pub args_original: Option<PathAssertion>,
    /// Path assertion over executed tool arguments; at least one matching
    /// record must satisfy it.
    #[serde(default)]
    pub args_executed: Option<PathAssertion>,
    /// Path assertion over tool output; at least one matching record with an
    /// output must satisfy it.
    #[serde(default)]
    pub result_path: Option<PathAssertion>,
}
147
/// Expected disambiguation state for an assertion.
///
/// Variants are written in snake_case in YAML/JSON (for example `give_up`,
/// `best_guess`). Each variant matches only the identically named
/// `DisambiguationStatus` variant recorded in the turn evidence.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum DisambiguationExpectation {
    Triggered,
    Skipped,
    Clarified,
    Abandoned,
    GiveUp,
    Escalated,
    BestGuess,
    Clear,
}
161
/// Dot-path assertion used for metadata, context, tools, and metrics.
///
/// All comparators that are set must hold for the assertion to pass.
#[derive(Debug, Clone, Deserialize, Serialize, Default)]
pub struct PathAssertion {
    /// Dot-separated path resolved against the JSON root being inspected; an
    /// empty path selects the root itself.
    pub path: String,
    /// Expected JSON value for exact equality.
    #[serde(default)]
    pub eq: Option<Value>,
    /// JSON value that must not equal the actual value.
    #[serde(default)]
    pub neq: Option<Value>,
    /// Allowed JSON values for membership checks (written as `in`).
    #[serde(default, rename = "in")]
    pub in_values: Option<Vec<Value>>,
    /// Substring expected in a string value, or element expected in an array
    /// value; any other combination of types fails.
    #[serde(default)]
    pub contains: Option<Value>,
    /// Whether the path must exist (`true`) or be absent (`false`).
    #[serde(default)]
    pub exists: Option<bool>,
    /// Numeric lower bound using greater-than-or-equal.
    #[serde(default)]
    pub gte: Option<f64>,
    /// Numeric upper bound using less-than-or-equal.
    #[serde(default)]
    pub lte: Option<f64>,
    /// Numeric lower bound using greater-than.
    #[serde(default)]
    pub gt: Option<f64>,
    /// Numeric upper bound using less-than.
    #[serde(default)]
    pub lt: Option<f64>,
}
195
/// Assertion over actor facts collected after a turn.
#[derive(Debug, Clone, Deserialize, Serialize, Default)]
pub struct FactsAssertion {
    /// Actor ID the fact evidence must belong to; unset accepts any actor.
    #[serde(default)]
    pub actor: Option<String>,
    /// Fact category to look for; a fact matches when its `category` value
    /// equals this string or ends with it.
    #[serde(default)]
    pub category: Option<String>,
    /// Semantic claim checked by a judge.
    #[serde(default)]
    pub semantic: Option<String>,
}
209
/// Assertion over actor relationship memory evidence.
///
/// NOTE(review): the evaluator for this type is not visible in this part of
/// the file; field descriptions follow the field names and types and should
/// be confirmed against it.
#[derive(Debug, Clone, Deserialize, Serialize, Default)]
pub struct RelationshipAssertion {
    /// Actor ID the relationship evidence is expected to belong to.
    #[serde(default)]
    pub actor: Option<String>,
    /// Whether relationship evidence must exist or be absent.
    #[serde(default)]
    pub exists: Option<bool>,
    /// Relationship perspective to inspect.
    #[serde(default)]
    pub perspective: Option<String>,
    /// Relationship dimension to compare.
    #[serde(default)]
    pub dimension: Option<String>,
    /// Numeric lower bound using greater-than-or-equal.
    #[serde(default)]
    pub gte: Option<f64>,
    /// Numeric upper bound using less-than-or-equal.
    #[serde(default)]
    pub lte: Option<f64>,
    /// Numeric lower bound using greater-than.
    #[serde(default)]
    pub gt: Option<f64>,
    /// Numeric upper bound using less-than.
    #[serde(default)]
    pub lt: Option<f64>,
    /// Numeric value expected for exact equality.
    #[serde(default)]
    pub eq: Option<f64>,
    /// Minimum interaction count expected.
    #[serde(default)]
    pub interaction_count_gte: Option<u64>,
    /// Minimum notable event count expected.
    #[serde(default)]
    pub notable_event_count_gte: Option<usize>,
}
247
/// Boolean or ID form for persona secret assertions.
///
/// Deserialized untagged: a YAML/JSON boolean becomes `Bool`, a string
/// becomes `Id`.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub enum SecretAssertion {
    /// Boolean form. NOTE(review): presumably "some secret was / was not
    /// revealed" — confirm against the evaluator.
    Bool(bool),
    /// String form. NOTE(review): presumably the ID of a specific secret —
    /// confirm against the evaluator.
    Id(String),
}
255
/// Assertion over orchestration metadata attached to a turn.
#[derive(Debug, Clone, Deserialize, Serialize, Default)]
pub struct OrchestrationAssertion {
    /// Expected orchestration pattern label.
    #[serde(default)]
    pub pattern: Option<String>,
    /// Expected orchestration type; written as `type` in YAML/JSON because
    /// `type` is a reserved word in Rust.
    #[serde(default, rename = "type")]
    pub type_name: Option<String>,
    /// Allowed final agent IDs.
    #[serde(default)]
    pub final_agent_in: Option<Vec<String>>,
    /// Agent IDs expected somewhere in orchestration metadata.
    #[serde(default)]
    pub agents_include: Option<Vec<String>>,
    /// Exact number of pipeline or stage records expected.
    #[serde(default)]
    pub stages: Option<usize>,
}
275
/// Assertion over the observability report for a turn.
///
/// The scalar limits are optional; the map and list fields default to empty
/// when omitted.
#[derive(Debug, Clone, Deserialize, Serialize, Default)]
pub struct ObservabilityAssertion {
    /// Upper bound (inclusive) for total LLM calls.
    #[serde(default)]
    pub total_llm_calls_lte: Option<u64>,
    /// Upper bound (inclusive) for total tool calls.
    #[serde(default)]
    pub total_tool_calls_lte: Option<u64>,
    /// Upper bound (inclusive) for total tokens.
    #[serde(default)]
    pub total_tokens_lte: Option<u64>,
    /// Upper bound (inclusive) for total estimated cost in USD.
    #[serde(default)]
    pub total_cost_usd_lte: Option<f64>,
    /// Path assertions over counts grouped by purpose, keyed by purpose name.
    #[serde(default)]
    pub purpose_counts: HashMap<String, PathAssertion>,
    /// Path assertions over counts grouped by status, keyed by status name.
    #[serde(default)]
    pub status_counts: HashMap<String, PathAssertion>,
    /// Path assertions over counts matching configured dimensions.
    #[serde(default)]
    pub dimension_counts: Vec<ObservabilityDimensionAssertion>,
}
301
/// Assertion over observability metrics matching all listed dimensions.
#[derive(Debug, Clone, Deserialize, Serialize, Default)]
pub struct ObservabilityDimensionAssertion {
    /// Dimension name to required value; defaults to an empty map when
    /// omitted.
    #[serde(default)]
    pub match_dimensions: HashMap<String, String>,
    /// Path assertion applied to the selected metrics; required, and written
    /// as `assert` in YAML/JSON.
    #[serde(rename = "assert")]
    pub assertion: PathAssertion,
}
310
/// Result detail for one evaluated assertion clause.
#[derive(Debug, Clone, Serialize)]
pub struct AssertionResultDetail {
    /// Name of the clause that produced this detail (for example `state`,
    /// `tool_called`, `any`).
    pub assertion: String,
    /// Whether the clause passed.
    pub passed: bool,
    /// Actual value observed during evaluation.
    pub actual: Value,
    /// Expected value or assertion object the actual value was checked
    /// against.
    pub expected: Value,
    /// Failure message for debugging; `None` for passing details.
    pub message: Option<String>,
}
325
/// Final outcome returned by assertion evaluation.
pub enum AssertionOutcome {
    /// Every recorded detail passed; carries those details.
    Passed(Vec<AssertionResultDetail>),
    /// At least one clause failed; carries the details gathered up to the
    /// point evaluation stopped.
    Failed(Vec<AssertionResultDetail>),
    /// Evaluation could not be completed; carries an error message.
    Error(String),
}
332
333impl AssertionResultDetail {
334    fn pass(name: impl Into<String>, actual: Value, expected: Value) -> Self {
335        Self {
336            assertion: name.into(),
337            passed: true,
338            actual,
339            expected,
340            message: None,
341        }
342    }
343
344    fn fail(
345        name: impl Into<String>,
346        actual: Value,
347        expected: Value,
348        message: impl Into<String>,
349    ) -> Self {
350        Self {
351            assertion: name.into(),
352            passed: false,
353            actual,
354            expected,
355            message: Some(message.into()),
356        }
357    }
358}
359
/// Runtime inputs needed while evaluating one assertion tree.
///
/// The struct only holds borrows and is `Copy`, so it is passed by value to
/// every nested evaluation.
#[derive(Clone, Copy)]
pub struct AssertionEvalContext<'a> {
    /// Full assertion-time evidence for this turn.
    pub evidence: &'a TurnEvidence,
    /// Assistant response text checked by the `response_*` clauses and sent
    /// to the judge.
    pub response: &'a str,
    /// Optional user input for judge prompt context.
    pub user_input: Option<&'a str>,
    /// Optional scenario ID for judge prompt context.
    pub scenario_id: Option<&'a str>,
    /// Optional language label for judge prompt context.
    pub language: Option<&'a str>,
    /// Optional resolver for semantic judge assertions; judge clauses fail
    /// when this is `None`.
    pub judge_resolver: Option<&'a JudgeResolver>,
}
376
377pub async fn evaluate_assertion(
378    assertion: &Assertion,
379    context: AssertionEvalContext<'_>,
380) -> AssertionOutcome {
381    let mut details = Vec::new();
382
383    if let Some(children) = &assertion.all {
384        for child in children {
385            match evaluate_assertion_boxed(child, context).await {
386                AssertionOutcome::Passed(mut d) => details.append(&mut d),
387                AssertionOutcome::Failed(mut d) => {
388                    details.append(&mut d);
389                    return AssertionOutcome::Failed(details);
390                }
391                AssertionOutcome::Error(e) => return AssertionOutcome::Error(e),
392            }
393        }
394        details.push(AssertionResultDetail::pass("all", json!(true), json!(true)));
395    }
396
397    if let Some(children) = &assertion.any {
398        let mut failures = Vec::new();
399        for child in children {
400            match evaluate_assertion_boxed(child, context).await {
401                AssertionOutcome::Passed(mut d) => {
402                    details.append(&mut d);
403                    details.push(AssertionResultDetail::pass("any", json!(true), json!(true)));
404                    return AssertionOutcome::Passed(details);
405                }
406                AssertionOutcome::Failed(mut d) => failures.append(&mut d),
407                AssertionOutcome::Error(e) => failures.push(AssertionResultDetail::fail(
408                    "any_branch_error",
409                    json!(e),
410                    json!("pass"),
411                    "branch error",
412                )),
413            }
414        }
415        details.extend(failures);
416        details.push(AssertionResultDetail::fail(
417            "any",
418            json!(false),
419            json!(true),
420            "no branch passed",
421        ));
422    }
423
424    if let Some(child) = &assertion.not {
425        match evaluate_assertion_boxed(child, context).await {
426            AssertionOutcome::Passed(_) => details.push(AssertionResultDetail::fail(
427                "not",
428                json!(true),
429                json!(false),
430                "child assertion passed",
431            )),
432            AssertionOutcome::Failed(_) => details.push(AssertionResultDetail::pass(
433                "not",
434                json!(false),
435                json!(false),
436            )),
437            AssertionOutcome::Error(e) => return AssertionOutcome::Error(e),
438        }
439    }
440
441    evaluate_simple(assertion, context, &mut details).await;
442
443    if details.iter().any(|d| !d.passed) {
444        AssertionOutcome::Failed(details)
445    } else {
446        AssertionOutcome::Passed(details)
447    }
448}
449
450fn evaluate_assertion_boxed<'a>(
451    assertion: &'a Assertion,
452    context: AssertionEvalContext<'a>,
453) -> std::pin::Pin<Box<dyn std::future::Future<Output = AssertionOutcome> + Send + 'a>> {
454    Box::pin(evaluate_assertion(assertion, context))
455}
456
457async fn evaluate_simple(
458    assertion: &Assertion,
459    context: AssertionEvalContext<'_>,
460    details: &mut Vec<AssertionResultDetail>,
461) {
462    let evidence = context.evidence;
463    let response = context.response;
464    if let Some(expected) = &assertion.state {
465        check_eq("state", evidence.state.clone(), expected.clone(), details);
466    }
467    if let Some(expected) = &assertion.state_in {
468        push_bool(
469            "state_in",
470            evidence
471                .state
472                .as_ref()
473                .is_some_and(|s| expected.contains(s)),
474            json!(evidence.state),
475            json!(expected),
476            details,
477        );
478    }
479    if let Some(expected) = &assertion.state_not {
480        push_bool(
481            "state_not",
482            evidence.state.as_ref().is_none_or(|s| s != expected),
483            json!(evidence.state),
484            json!(expected),
485            details,
486        );
487    }
488    if let Some(expected) = &assertion.state_history_contains {
489        let passed = evidence
490            .state_history
491            .iter()
492            .any(|event| &event.to == expected || &event.from == expected);
493        push_bool(
494            "state_history_contains",
495            passed,
496            json!(evidence.state_history),
497            json!(expected),
498            details,
499        );
500    }
501    if let Some(expected) = &assertion.response_contains {
502        for item in expected.items() {
503            push_bool(
504                "response_contains",
505                response.contains(&item),
506                json!(response),
507                json!(item),
508                details,
509            );
510        }
511    }
512    if let Some(expected) = &assertion.response_contains_any {
513        let items = expected.items();
514        push_bool(
515            "response_contains_any",
516            items.iter().any(|item| response.contains(item)),
517            json!(response),
518            json!(items),
519            details,
520        );
521    }
522    if let Some(expected) = &assertion.response_not_contains {
523        for item in expected.items() {
524            push_bool(
525                "response_not_contains",
526                !response.contains(&item),
527                json!(response),
528                json!(item),
529                details,
530            );
531        }
532    }
533    if let Some(expected) = assertion.response_not_empty {
534        push_bool(
535            "response_not_empty",
536            (!response.trim().is_empty()) == expected,
537            json!(response),
538            json!(expected),
539            details,
540        );
541    }
542    if let Some(expected) = &assertion.disambiguation {
543        let actual = evidence.disambiguation.as_ref().map(|d| &d.status);
544        push_bool(
545            "disambiguation",
546            actual.is_some_and(|status| disambiguation_matches(status, expected)),
547            json!(actual),
548            json!(expected),
549            details,
550        );
551    }
552    if let Some(expected) = assertion.no_disambiguation {
553        let triggered = evidence.disambiguation.as_ref().is_some_and(|d| {
554            matches!(
555                d.status,
556                DisambiguationStatus::Triggered
557                    | DisambiguationStatus::Clarified
558                    | DisambiguationStatus::BestGuess
559            )
560        });
561        push_bool(
562            "no_disambiguation",
563            (!triggered) == expected,
564            json!(!triggered),
565            json!(expected),
566            details,
567        );
568    }
569    if let Some(tool) = &assertion.tool_called {
570        evaluate_tool_called(tool, evidence, details);
571    }
572    if let Some(tool_id) = &assertion.tool_not_called {
573        let passed = !evidence
574            .tool_executions
575            .iter()
576            .any(|record| &record.tool_id == tool_id || &record.requested_name == tool_id);
577        push_bool(
578            "tool_not_called",
579            passed,
580            json!(tool_id),
581            json!("not called"),
582            details,
583        );
584    }
585    if let Some(skill_id) = &assertion.skill_triggered {
586        let passed = evidence.skill.as_ref().is_some_and(|skill| {
587            skill.selected_skill_id.as_deref() == Some(skill_id)
588                || skill.executed_skill_id.as_deref() == Some(skill_id)
589        });
590        push_bool(
591            "skill_triggered",
592            passed,
593            json!(evidence.skill),
594            json!(skill_id),
595            details,
596        );
597    }
598    if let Some(expected) = &assertion.metadata_contains {
599        evaluate_metadata_contains(expected, evidence, details);
600    }
601    if let Some(path) = &assertion.metadata_path {
602        evaluate_path(
603            "metadata_path",
604            evidence.response_metadata.as_ref(),
605            path,
606            details,
607        );
608    }
609    if let Some(path) = &assertion.context_path {
610        evaluate_path("context_path", Some(&evidence.context), path, details);
611    }
612    if let Some(expected) = &assertion.facts_include {
613        evaluate_facts(expected, evidence, context.judge_resolver, details).await;
614    }
615    if let Some(expected) = &assertion.relationship {
616        evaluate_relationship(expected, evidence, details);
617    }
618    if let Some(expected) = &assertion.persona_secret_revealed {
619        evaluate_secret(expected, evidence, details);
620    }
621    if let Some(expected) = &assertion.orchestration {
622        evaluate_orchestration(expected, evidence, details);
623    }
624    if let Some(expected) = &assertion.observability {
625        evaluate_observability(expected, evidence, details);
626    }
627    if let Some(criteria) = assertion
628        .judge
629        .as_ref()
630        .or(assertion.response_semantic.as_ref())
631    {
632        if let Some(resolver) = context.judge_resolver {
633            match resolver.resolve(criteria.llm.as_deref()) {
634                Ok(judge) => match judge
635                    .evaluate_input(
636                        JudgeInput {
637                            response,
638                            user_input: context.user_input,
639                            scenario_id: context.scenario_id,
640                            language: context.language,
641                        },
642                        criteria,
643                    )
644                    .await
645                {
646                    Ok(result) => push_bool(
647                        "judge",
648                        result.passed,
649                        json!(result.overall_score),
650                        json!(criteria.pass_threshold),
651                        details,
652                    ),
653                    Err(error) => details.push(AssertionResultDetail::fail(
654                        "judge",
655                        json!(error.to_string()),
656                        json!("valid judge result"),
657                        "judge failed",
658                    )),
659                },
660                Err(error) => details.push(AssertionResultDetail::fail(
661                    "judge",
662                    json!(error.to_string()),
663                    json!("available judge LLM"),
664                    "judge failed",
665                )),
666            }
667        } else {
668            details.push(AssertionResultDetail::fail(
669                "judge",
670                json!(null),
671                json!("judge configured"),
672                "no judge LLM available",
673            ));
674        }
675    }
676}
677
678fn check_eq<T: PartialEq + Serialize>(
679    name: &str,
680    actual: Option<T>,
681    expected: T,
682    details: &mut Vec<AssertionResultDetail>,
683) {
684    push_bool(
685        name,
686        actual.as_ref().is_some_and(|a| *a == expected),
687        json!(actual),
688        json!(expected),
689        details,
690    );
691}
692fn push_bool(
693    name: &str,
694    passed: bool,
695    actual: Value,
696    expected: Value,
697    details: &mut Vec<AssertionResultDetail>,
698) {
699    if passed {
700        details.push(AssertionResultDetail::pass(name, actual, expected));
701    } else {
702        details.push(AssertionResultDetail::fail(
703            name,
704            actual,
705            expected,
706            "assertion failed",
707        ));
708    }
709}
710
711fn disambiguation_matches(
712    actual: &DisambiguationStatus,
713    expected: &DisambiguationExpectation,
714) -> bool {
715    matches!(
716        (actual, expected),
717        (
718            DisambiguationStatus::Triggered,
719            DisambiguationExpectation::Triggered
720        ) | (
721            DisambiguationStatus::Skipped,
722            DisambiguationExpectation::Skipped
723        ) | (
724            DisambiguationStatus::Clarified,
725            DisambiguationExpectation::Clarified
726        ) | (
727            DisambiguationStatus::Abandoned,
728            DisambiguationExpectation::Abandoned
729        ) | (
730            DisambiguationStatus::GiveUp,
731            DisambiguationExpectation::GiveUp
732        ) | (
733            DisambiguationStatus::Escalated,
734            DisambiguationExpectation::Escalated
735        ) | (
736            DisambiguationStatus::BestGuess,
737            DisambiguationExpectation::BestGuess
738        ) | (
739            DisambiguationStatus::Clear,
740            DisambiguationExpectation::Clear
741        )
742    )
743}
744
745fn evaluate_tool_called(
746    assertion: &ToolCalledAssertion,
747    evidence: &TurnEvidence,
748    details: &mut Vec<AssertionResultDetail>,
749) {
750    let (id, object) = match assertion {
751        ToolCalledAssertion::Id(id) => (Some(id.as_str()), None),
752        ToolCalledAssertion::Object(object) => (object.id.as_deref(), Some(object)),
753    };
754    let mut records: Vec<&ToolExecutionRecord> = evidence.tool_executions.iter().collect();
755    if let Some(id) = id {
756        records.retain(|record| record.tool_id == id || record.requested_name == id);
757    }
758    if let Some(object) = object {
759        if let Some(success) = object.success {
760            records.retain(|record| record.success == success);
761        }
762        if let Some(sources) = &object.source_in {
763            records.retain(|record| {
764                sources
765                    .iter()
766                    .any(|source| *source == serde_plain_source(&record.source))
767            });
768        }
769    }
770    let count = records.len();
771    let mut passed = count > 0;
772    if let Some(object) = object {
773        if let Some(expected) = object.count {
774            passed &= count == expected;
775        }
776        if let Some(expected) = object.count_gte {
777            passed &= count >= expected;
778        }
779        if let Some(path) = object.args.as_ref().or(object.args_executed.as_ref()) {
780            passed &= records
781                .iter()
782                .any(|record| path_matches(&record.arguments_executed, path));
783        }
784        if let Some(path) = &object.args_original {
785            passed &= records
786                .iter()
787                .any(|record| path_matches(&record.arguments_original, path));
788        }
789        if let Some(path) = &object.result_path {
790            passed &= records.iter().any(|record| {
791                record
792                    .output
793                    .as_ref()
794                    .is_some_and(|value| path_matches(value, path))
795            });
796        }
797    }
798    push_bool(
799        "tool_called",
800        passed,
801        json!(count),
802        json!(assertion),
803        details,
804    );
805}
806
807fn serde_plain_source(source: &crate::evidence::ToolExecutionSource) -> String {
808    serde_json::to_string(source)
809        .unwrap_or_default()
810        .trim_matches('"')
811        .to_string()
812}
813fn evaluate_metadata_contains(
814    expected: &HashMap<String, Value>,
815    evidence: &TurnEvidence,
816    details: &mut Vec<AssertionResultDetail>,
817) {
818    let metadata = evidence.response_metadata.as_ref();
819    let passed = metadata.is_some_and(|metadata| {
820        expected
821            .iter()
822            .all(|(key, expected)| metadata.get(key) == Some(expected))
823    }) || (expected.is_empty() && metadata.is_none());
824    push_bool(
825        "metadata_contains",
826        passed,
827        json!(metadata),
828        json!(expected),
829        details,
830    );
831}
832
833fn evaluate_path(
834    name: &str,
835    root: Option<&Value>,
836    assertion: &PathAssertion,
837    details: &mut Vec<AssertionResultDetail>,
838) {
839    let actual = root.and_then(|value| get_path(value, &assertion.path));
840    push_bool(
841        name,
842        path_actual_matches(actual, assertion),
843        json!(actual),
844        json!(assertion),
845        details,
846    );
847}
848fn path_matches(root: &Value, assertion: &PathAssertion) -> bool {
849    path_actual_matches(get_path(root, &assertion.path), assertion)
850}
851
852fn path_actual_matches(actual: Option<&Value>, assertion: &PathAssertion) -> bool {
853    if let Some(exists) = assertion.exists {
854        if exists != actual.is_some() {
855            return false;
856        }
857    }
858    let Some(actual) = actual else {
859        return assertion.exists == Some(false);
860    };
861    if let Some(expected) = &assertion.eq {
862        if actual != expected {
863            return false;
864        }
865    }
866    if let Some(expected) = &assertion.neq {
867        if actual == expected {
868            return false;
869        }
870    }
871    if let Some(values) = &assertion.in_values {
872        if !values.contains(actual) {
873            return false;
874        }
875    }
876    if let Some(expected) = &assertion.contains {
877        let contains = match (actual, expected) {
878            (Value::String(a), Value::String(e)) => a.contains(e),
879            (Value::Array(arr), e) => arr.contains(e),
880            _ => false,
881        };
882        if !contains {
883            return false;
884        }
885    }
886    if let Some(expected) = assertion.gte {
887        if actual.as_f64().unwrap_or(f64::NAN) < expected {
888            return false;
889        }
890    }
891    if let Some(expected) = assertion.lte {
892        if actual.as_f64().unwrap_or(f64::NAN) > expected {
893            return false;
894        }
895    }
896    if let Some(expected) = assertion.gt {
897        if actual.as_f64().unwrap_or(f64::NAN) <= expected {
898            return false;
899        }
900    }
901    if let Some(expected) = assertion.lt {
902        if actual.as_f64().unwrap_or(f64::NAN) >= expected {
903            return false;
904        }
905    }
906    true
907}
908
909fn get_path<'a>(value: &'a Value, path: &str) -> Option<&'a Value> {
910    if path.is_empty() {
911        return Some(value);
912    }
913    let mut current = value;
914    for part in path.split('.') {
915        current = current.get(part)?;
916    }
917    Some(current)
918}
919
920async fn evaluate_facts(
921    assertion: &FactsAssertion,
922    evidence: &TurnEvidence,
923    judge_resolver: Option<&JudgeResolver>,
924    details: &mut Vec<AssertionResultDetail>,
925) {
926    let Some(fact_evidence) = &evidence.facts else {
927        push_bool(
928            "facts_include",
929            false,
930            json!(null),
931            json!(assertion),
932            details,
933        );
934        return;
935    };
936    if let Some(actor) = &assertion.actor {
937        if fact_evidence.actor_id.as_deref() != Some(actor.as_str()) {
938            push_bool(
939                "facts_include",
940                false,
941                json!(fact_evidence.actor_id),
942                json!(actor),
943                details,
944            );
945            return;
946        }
947    }
948    let facts: Vec<Value> = fact_evidence
949        .facts
950        .iter()
951        .filter(|fact| {
952            assertion.category.as_ref().is_none_or(|category| {
953                fact.get("category")
954                    .map(|value| value.to_string().trim_matches('"').to_string())
955                    .is_some_and(|actual| actual == *category || actual.ends_with(category))
956            })
957        })
958        .cloned()
959        .collect();
960    let mut passed = !facts.is_empty();
961    if let Some(semantic) = &assertion.semantic {
962        if let Some(resolver) = judge_resolver {
963            match resolver.resolve(None) {
964                Ok(judge) => {
965                    let criteria = JudgeAssertion {
966                        llm: None,
967                        pass_threshold: 0.75,
968                        criteria: vec![crate::judge::JudgeCriterion::Text(format!(
969                            "The fact set supports this claim: {}",
970                            semantic
971                        ))],
972                    };
973                    let fact_text = serde_json::to_string(&facts).unwrap_or_default();
974                    match judge.evaluate(&fact_text, &criteria).await {
975                        Ok(result) => passed &= result.passed,
976                        Err(error) => {
977                            details.push(AssertionResultDetail::fail(
978                                "facts_include",
979                                json!(error.to_string()),
980                                json!(semantic),
981                                "fact semantic judge failed",
982                            ));
983                            return;
984                        }
985                    }
986                }
987                Err(error) => {
988                    details.push(AssertionResultDetail::fail(
989                        "facts_include",
990                        json!(error.to_string()),
991                        json!(semantic),
992                        "fact semantic judge failed",
993                    ));
994                    return;
995                }
996            }
997        } else {
998            details.push(AssertionResultDetail::fail(
999                "facts_include",
1000                json!(null),
1001                json!(semantic),
1002                "semantic fact assertion requires a judge LLM",
1003            ));
1004            return;
1005        }
1006    }
1007    push_bool(
1008        "facts_include",
1009        passed,
1010        json!(facts),
1011        json!(assertion),
1012        details,
1013    );
1014}
1015
1016fn evaluate_relationship(
1017    assertion: &RelationshipAssertion,
1018    evidence: &TurnEvidence,
1019    details: &mut Vec<AssertionResultDetail>,
1020) {
1021    let Some(rel) = &evidence.relationship else {
1022        push_bool(
1023            "relationship",
1024            assertion.exists == Some(false),
1025            json!(null),
1026            json!(assertion),
1027            details,
1028        );
1029        return;
1030    };
1031    if let Some(actor) = &assertion.actor {
1032        if rel.actor_id.as_deref() != Some(actor.as_str()) {
1033            push_bool(
1034                "relationship",
1035                false,
1036                json!(rel.actor_id),
1037                json!(actor),
1038                details,
1039            );
1040            return;
1041        }
1042    }
1043    let current = rel.current.as_ref();
1044    let mut passed = assertion
1045        .exists
1046        .map(|expected| expected == current.is_some())
1047        .unwrap_or(true);
1048    let perspective = assertion.perspective.as_deref().unwrap_or("agent_to_actor");
1049    if !rel.available_perspectives.iter().any(|p| p == perspective) {
1050        details.push(AssertionResultDetail::fail(
1051            "relationship",
1052            json!(rel.available_perspectives),
1053            json!(perspective),
1054            "relationship perspective unavailable for model",
1055        ));
1056        return;
1057    }
1058    if let Some(count) = assertion.interaction_count_gte {
1059        let actual = current
1060            .and_then(|v| v.get("interaction_count"))
1061            .and_then(Value::as_u64)
1062            .unwrap_or(0);
1063        passed &= actual >= count;
1064    }
1065    if let Some(count) = assertion.notable_event_count_gte {
1066        let actual = current
1067            .and_then(|v| v.get("notable_events"))
1068            .and_then(Value::as_array)
1069            .map(Vec::len)
1070            .unwrap_or(0);
1071        passed &= actual >= count;
1072    }
1073    if let Some(dimension) = &assertion.dimension {
1074        let value = relationship_dimension_value(current, perspective, dimension);
1075        let mut dim_pass = value.is_some();
1076        if let Some(v) = assertion.gte {
1077            dim_pass &= value.unwrap_or(f64::NAN) >= v;
1078        }
1079        if let Some(v) = assertion.lte {
1080            dim_pass &= value.unwrap_or(f64::NAN) <= v;
1081        }
1082        if let Some(v) = assertion.gt {
1083            dim_pass &= value.unwrap_or(f64::NAN) > v;
1084        }
1085        if let Some(v) = assertion.lt {
1086            dim_pass &= value.unwrap_or(f64::NAN) < v;
1087        }
1088        if let Some(v) = assertion.eq {
1089            dim_pass &= (value.unwrap_or(f64::NAN) - v).abs() < f64::EPSILON;
1090        }
1091        passed &= dim_pass;
1092    }
1093    push_bool(
1094        "relationship",
1095        passed,
1096        json!(current),
1097        json!(assertion),
1098        details,
1099    );
1100}
1101
1102fn relationship_dimension_value(
1103    current: Option<&Value>,
1104    perspective: &str,
1105    dimension: &str,
1106) -> Option<f64> {
1107    let current = current?;
1108    match perspective {
1109        "agent_to_actor" => current.get("dimensions")?.get(dimension)?.as_f64(),
1110        "perceived_actor_to_agent" => current
1111            .get("perceived_actor_to_agent")?
1112            .get(dimension)?
1113            .as_f64(),
1114        "mutual" => current.get("dimensions")?.get(dimension)?.as_f64(),
1115        _ => None,
1116    }
1117}
1118
1119fn evaluate_secret(
1120    assertion: &SecretAssertion,
1121    evidence: &TurnEvidence,
1122    details: &mut Vec<AssertionResultDetail>,
1123) {
1124    let persona = evidence.persona.as_ref();
1125    let actual = persona.is_some_and(|p| p.secret_revealed);
1126    let passed = match assertion {
1127        SecretAssertion::Bool(expected) => actual == *expected,
1128        SecretAssertion::Id(id) => persona.is_some_and(|p| p.revealed_secret_ids.contains(id)),
1129    };
1130    push_bool(
1131        "persona_secret_revealed",
1132        passed,
1133        json!(actual),
1134        json!(assertion),
1135        details,
1136    );
1137}
1138
1139fn evaluate_orchestration(
1140    assertion: &OrchestrationAssertion,
1141    evidence: &TurnEvidence,
1142    details: &mut Vec<AssertionResultDetail>,
1143) {
1144    let Some(value) = &evidence.orchestration else {
1145        push_bool(
1146            "orchestration",
1147            false,
1148            json!(null),
1149            json!(assertion),
1150            details,
1151        );
1152        return;
1153    };
1154    let mut passed = true;
1155    if let Some(pattern) = assertion.pattern.as_ref().or(assertion.type_name.as_ref()) {
1156        passed &= value
1157            .get("type")
1158            .or_else(|| value.get("pattern"))
1159            .and_then(Value::as_str)
1160            == Some(pattern.as_str());
1161    }
1162    if let Some(finals) = &assertion.final_agent_in {
1163        let actual = value
1164            .get("final_agent")
1165            .or_else(|| value.get("to_agent"))
1166            .or_else(|| value.get("agent"))
1167            .and_then(Value::as_str);
1168        passed &= actual.is_some_and(|a| finals.iter().any(|f| f == a));
1169    }
1170    if let Some(required) = &assertion.agents_include {
1171        let agents = collect_orchestration_agents(value);
1172        passed &= required
1173            .iter()
1174            .all(|agent| agents.iter().any(|a| a == agent));
1175    }
1176    if let Some(stages) = assertion.stages {
1177        let actual = value
1178            .get("stages")
1179            .and_then(Value::as_array)
1180            .map(Vec::len)
1181            .unwrap_or(0);
1182        passed &= actual == stages;
1183    }
1184    push_bool(
1185        "orchestration",
1186        passed,
1187        value.clone(),
1188        json!(assertion),
1189        details,
1190    );
1191}
1192
1193fn collect_orchestration_agents(value: &Value) -> Vec<String> {
1194    let mut agents = Vec::new();
1195    collect_agent_strings(value, &mut agents);
1196    agents.sort();
1197    agents.dedup();
1198    agents
1199}
1200
1201fn collect_agent_strings(value: &Value, agents: &mut Vec<String>) {
1202    match value {
1203        Value::Object(map) => {
1204            for (key, value) in map {
1205                if matches!(
1206                    key.as_str(),
1207                    "agent" | "agent_id" | "id" | "final_agent" | "to_agent" | "from_agent"
1208                ) {
1209                    if let Some(text) = value.as_str() {
1210                        agents.push(text.to_string());
1211                    }
1212                }
1213                collect_agent_strings(value, agents);
1214            }
1215        }
1216        Value::Array(values) => {
1217            for value in values {
1218                if let Some(text) = value.as_str() {
1219                    agents.push(text.to_string());
1220                }
1221                collect_agent_strings(value, agents);
1222            }
1223        }
1224        _ => {}
1225    }
1226}
1227
1228fn evaluate_observability(
1229    assertion: &ObservabilityAssertion,
1230    evidence: &TurnEvidence,
1231    details: &mut Vec<AssertionResultDetail>,
1232) {
1233    let report = evidence
1234        .observability
1235        .as_ref()
1236        .and_then(|o| o.report.as_ref());
1237    let Some(report) = report else {
1238        push_bool(
1239            "observability",
1240            false,
1241            json!(null),
1242            json!(assertion),
1243            details,
1244        );
1245        return;
1246    };
1247    let mut passed = true;
1248    if let Some(max) = assertion.total_llm_calls_lte {
1249        passed &= report.summary.total_llm_calls <= max;
1250    }
1251    if let Some(max) = assertion.total_tool_calls_lte {
1252        passed &= report.summary.total_tool_calls <= max;
1253    }
1254    if let Some(max) = assertion.total_tokens_lte {
1255        passed &= report.summary.total_tokens <= max;
1256    }
1257    if let Some(max) = assertion.total_cost_usd_lte {
1258        passed &= report.summary.total_cost_usd <= max;
1259    }
1260    for (purpose, path_assertion) in &assertion.purpose_counts {
1261        let count = report
1262            .by_purpose
1263            .iter()
1264            .find(|metric| metric.dimensions.get("purpose") == Some(purpose))
1265            .map(|metric| metric.count)
1266            .unwrap_or(0);
1267        passed &= path_matches(&json!({"count": count}), path_assertion);
1268    }
1269    for (status, path_assertion) in &assertion.status_counts {
1270        let count = report
1271            .configured
1272            .iter()
1273            .find(|metric| metric.dimensions.get("status") == Some(status))
1274            .map(|metric| metric.count)
1275            .unwrap_or(0);
1276        passed &= path_matches(&json!({"count": count}), path_assertion);
1277    }
1278    for dimension_assertion in &assertion.dimension_counts {
1279        let count: u64 = report
1280            .configured
1281            .iter()
1282            .filter(|metric| {
1283                dimension_assertion
1284                    .match_dimensions
1285                    .iter()
1286                    .all(|(key, value)| metric.dimensions.get(key) == Some(value))
1287            })
1288            .map(|metric| metric.count)
1289            .sum();
1290        passed &= path_matches(&json!({"count": count}), &dimension_assertion.assertion);
1291    }
1292    push_bool(
1293        "observability",
1294        passed,
1295        json!(report.summary),
1296        json!(assertion),
1297        details,
1298    );
1299}
1300
/// Unit tests that run `evaluate_assertion` end to end over a hand-built evidence
/// fixture.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::evidence::{FactsEvidence, ToolExecutionSource, TurnObservabilityEvidence};
    use ai_agents_observability::{
        AggregatedMetrics, CostBreakdown, CostStats, LatencyStats, ObservabilityReport,
        ReportSummary, TokenBreakdown, TokenStats,
    };

    /// Builds the shared evidence fixture: state `ready` reached from `start`, context
    /// `user.tier = "vip"`, one successful mock `lookup_order` tool execution, one
    /// `user_preference` fact for `actor-1`, and a two-stage `pipeline` orchestration
    /// with agents `writer` and `editor`. Skill, disambiguation, relationship, persona
    /// and observability evidence are left unset.
    fn evidence() -> TurnEvidence {
        TurnEvidence {
            response_metadata: Some(json!({"intent":"greeting","score":0.9})),
            state: Some("ready".to_string()),
            state_history: vec![ai_agents_core::StateTransitionEvent {
                from: "start".to_string(),
                to: "ready".to_string(),
                reason: "test".to_string(),
                timestamp: chrono::Utc::now(),
            }],
            context: json!({"user":{"tier":"vip"}}),
            tool_executions: vec![ToolExecutionRecord {
                call_id: "call-1".to_string(),
                tool_id: "lookup_order".to_string(),
                requested_name: "lookup_order".to_string(),
                source: ToolExecutionSource::Mock,
                state: None,
                actor_id: Some("actor-1".to_string()),
                arguments_original: json!({"id":"ORD-1"}),
                arguments_executed: json!({"id":"ORD-1"}),
                success: true,
                output: Some(json!({"status":"cancellable"})),
                error: None,
                metadata: None,
                started_at: chrono::Utc::now(),
                duration_ms: 1,
                observability_span_id: None,
            }],
            skill: None,
            disambiguation: None,
            facts: Some(FactsEvidence {
                actor_id: Some("actor-1".to_string()),
                facts: vec![
                    json!({"category":"user_preference","content":"Prefers concise answers"}),
                ],
                before_count: None,
                after_count: Some(1),
            }),
            relationship: None,
            persona: None,
            orchestration: Some(json!({
                "type":"pipeline",
                "stages":[{"agent_id":"writer"},{"agent_id":"editor"}],
                "agents":["writer","editor"]
            })),
            observability: None,
        }
    }

    /// State, response, metadata, context-path, tool-call, facts and orchestration
    /// clauses that all agree with the fixture must yield a passing outcome.
    #[tokio::test]
    async fn evaluates_structured_assertions() {
        let mut metadata = HashMap::new();
        metadata.insert("intent".to_string(), json!("greeting"));
        let assertion = Assertion {
            state: Some("ready".to_string()),
            state_history_contains: Some("ready".to_string()),
            response_contains: Some(StringList::One("Hello".to_string())),
            metadata_contains: Some(metadata),
            context_path: Some(PathAssertion {
                path: "user.tier".to_string(),
                eq: Some(json!("vip")),
                ..Default::default()
            }),
            tool_called: Some(ToolCalledAssertion::Object(ToolCalledObject {
                id: Some("lookup_order".to_string()),
                success: Some(true),
                result_path: Some(PathAssertion {
                    path: "status".to_string(),
                    eq: Some(json!("cancellable")),
                    ..Default::default()
                }),
                ..Default::default()
            })),
            facts_include: Some(FactsAssertion {
                actor: Some("actor-1".to_string()),
                category: Some("user_preference".to_string()),
                semantic: None,
            }),
            orchestration: Some(OrchestrationAssertion {
                pattern: Some("pipeline".to_string()),
                agents_include: Some(vec!["writer".to_string(), "editor".to_string()]),
                stages: Some(2),
                ..Default::default()
            }),
            ..Default::default()
        };
        let evidence = evidence();
        let result = evaluate_assertion(
            &assertion,
            AssertionEvalContext {
                evidence: &evidence,
                response: "Hello there",
                user_input: Some("Hello"),
                scenario_id: Some("test"),
                language: Some("en"),
                judge_resolver: None,
            },
        )
        .await;
        assert!(matches!(result, AssertionOutcome::Passed(_)));
    }

    /// A facts assertion naming an actor other than the fixture's `actor-1` must fail,
    /// even though the category matches.
    #[tokio::test]
    async fn facts_actor_mismatch_fails() {
        let assertion = Assertion {
            facts_include: Some(FactsAssertion {
                actor: Some("other".to_string()),
                category: Some("user_preference".to_string()),
                semantic: None,
            }),
            ..Default::default()
        };
        let evidence = evidence();
        let result = evaluate_assertion(
            &assertion,
            AssertionEvalContext {
                evidence: &evidence,
                response: "ok",
                user_input: None,
                scenario_id: None,
                language: None,
                judge_resolver: None,
            },
        )
        .await;
        assert!(matches!(result, AssertionOutcome::Failed(_)));
    }

    /// A dimension-count clause matching on a subset of a configured metric's
    /// dimensions (`background = "true"`) must see that metric's count of 2 and pass
    /// `gte: 2.0`.
    #[tokio::test]
    async fn observability_dimension_counts_match_configured_metrics() {
        let mut evidence = evidence();
        // The metric carries two dimensions; the assertion below matches on only one.
        let mut dimensions = HashMap::new();
        dimensions.insert("background".to_string(), "true".to_string());
        dimensions.insert("maintenance".to_string(), "facts".to_string());
        let metric = AggregatedMetrics {
            dimensions,
            count: 2,
            errors: 0,
            latency: LatencyStats::default(),
            tokens: TokenStats::default(),
            cost: CostStats::default(),
        };
        evidence.observability = Some(TurnObservabilityEvidence {
            trace_id: Some("trace".to_string()),
            span_ids: vec!["span".to_string()],
            report: Some(ObservabilityReport {
                summary: ReportSummary::default(),
                configured: vec![metric],
                by_model: vec![],
                by_purpose: vec![],
                by_language: vec![],
                by_state: vec![],
                by_agent: vec![],
                by_orchestration_pattern: vec![],
                cost_breakdown: CostBreakdown::default(),
                token_breakdown: TokenBreakdown::default(),
                dropped_events: 0,
            }),
        });
        let mut match_dimensions = HashMap::new();
        match_dimensions.insert("background".to_string(), "true".to_string());
        let assertion = Assertion {
            observability: Some(ObservabilityAssertion {
                dimension_counts: vec![ObservabilityDimensionAssertion {
                    match_dimensions,
                    assertion: PathAssertion {
                        path: "count".to_string(),
                        gte: Some(2.0),
                        ..Default::default()
                    },
                }],
                ..Default::default()
            }),
            ..Default::default()
        };

        let result = evaluate_assertion(
            &assertion,
            AssertionEvalContext {
                evidence: &evidence,
                response: "ok",
                user_input: None,
                scenario_id: None,
                language: None,
                judge_resolver: None,
            },
        )
        .await;

        assert!(matches!(result, AssertionOutcome::Passed(_)));
    }
}
ai_agents_eval/assertion.rs

ai_agents_eval/
assertion.rs