1use serde::{Deserialize, Serialize};
2use serde_json::{Value, json};
3use std::collections::HashMap;
4
5use crate::evidence::{DisambiguationStatus, ToolExecutionRecord, TurnEvidence};
6use crate::judge::{JudgeAssertion, JudgeInput, JudgeResolver};
7
/// A declarative assertion over one turn's evidence and response text.
///
/// Every field is optional and only the fields that are set are checked.
/// `all`, `any` and `not` nest further assertions; the remaining fields are
/// individual checks that each report their own result detail.
#[derive(Debug, Clone, Deserialize, Serialize, Default)]
pub struct Assertion {
    /// Expected current state; passes only when the evidence state is set and equal.
    #[serde(default)]
    pub state: Option<String>,
    /// Passes when the evidence state is set and is one of these values.
    #[serde(default)]
    pub state_in: Option<Vec<String>>,
    /// Passes when the evidence state is unset or differs from this value.
    #[serde(default)]
    pub state_not: Option<String>,
    /// Passes when any recorded state transition has this value as `from` or `to`.
    #[serde(default)]
    pub state_history_contains: Option<String>,
    /// Substrings that must all occur in the response (one detail per item).
    #[serde(default)]
    pub response_contains: Option<StringList>,
    /// Substrings of which at least one must occur in the response.
    #[serde(default)]
    pub response_contains_any: Option<StringList>,
    /// Substrings that must not occur in the response (one detail per item).
    #[serde(default)]
    pub response_not_contains: Option<StringList>,
    /// Expected value of "the trimmed response is non-empty".
    #[serde(default)]
    pub response_not_empty: Option<bool>,
    /// Judge criteria for the response; reported under the `judge` assertion name.
    #[serde(default)]
    pub response_semantic: Option<JudgeAssertion>,
    /// Expected disambiguation status; fails when no disambiguation evidence exists.
    #[serde(default)]
    pub disambiguation: Option<DisambiguationExpectation>,
    /// Expected value of "no disambiguation happened", where a status of
    /// triggered, clarified or best-guess counts as having happened.
    #[serde(default)]
    pub no_disambiguation: Option<bool>,
    /// Requires matching tool execution records; see [`ToolCalledAssertion`].
    #[serde(default)]
    pub tool_called: Option<ToolCalledAssertion>,
    /// Passes when no tool execution has this value as tool id or requested name.
    #[serde(default)]
    pub tool_not_called: Option<String>,
    /// Passes when the skill evidence's selected or executed skill id equals this value.
    #[serde(default)]
    pub skill_triggered: Option<String>,
    /// Key/value pairs that must each be present, with an equal value, in the response metadata.
    #[serde(default)]
    pub metadata_contains: Option<HashMap<String, Value>>,
    /// Path assertion evaluated against the response metadata.
    #[serde(default)]
    pub metadata_path: Option<PathAssertion>,
    /// Path assertion evaluated against the evidence context.
    #[serde(default)]
    pub context_path: Option<PathAssertion>,
    /// Assertion over the recorded facts; see [`FactsAssertion`].
    #[serde(default)]
    pub facts_include: Option<FactsAssertion>,
    /// Assertion over the relationship evidence; see [`RelationshipAssertion`].
    #[serde(default)]
    pub relationship: Option<RelationshipAssertion>,
    /// Assertion over persona secret disclosure; see [`SecretAssertion`].
    #[serde(default)]
    pub persona_secret_revealed: Option<SecretAssertion>,
    /// Assertion over the orchestration evidence; see [`OrchestrationAssertion`].
    #[serde(default)]
    pub orchestration: Option<OrchestrationAssertion>,
    /// Assertion over the observability report; see [`ObservabilityAssertion`].
    #[serde(default)]
    pub observability: Option<ObservabilityAssertion>,
    /// Judge criteria evaluated against the response by a judge LLM.
    #[serde(default)]
    pub judge: Option<JudgeAssertion>,
    /// Child assertions of which at least one must pass.
    #[serde(default)]
    pub any: Option<Vec<Assertion>>,
    /// Child assertions that must all pass.
    #[serde(default)]
    pub all: Option<Vec<Assertion>>,
    /// Child assertion that must fail for this assertion to pass.
    #[serde(default)]
    pub not: Option<Box<Assertion>>,
}
90
/// Either a single string or a list of strings.
///
/// Deserialized untagged, so both a bare string and an array of strings are
/// accepted in the input.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub enum StringList {
    /// A single string value.
    One(String),
    /// A list of string values.
    Many(Vec<String>),
}
98
99impl StringList {
100 fn items(&self) -> Vec<String> {
101 match self {
102 Self::One(value) => vec![value.clone()],
103 Self::Many(values) => values.clone(),
104 }
105 }
106}
107
/// Tool-call expectation, given either as a bare identifier or as a detailed object.
///
/// Deserialized untagged: a string becomes `Id`, a map becomes `Object`.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub enum ToolCalledAssertion {
    /// Identifier matched against a record's tool id or requested name.
    Id(String),
    /// Detailed expectation with optional filters and constraints.
    Object(ToolCalledObject),
}
115
/// Detailed tool-call expectation: filters that select execution records plus
/// constraints on the selected records.
#[derive(Debug, Clone, Deserialize, Serialize, Default)]
pub struct ToolCalledObject {
    /// Keeps only records whose tool id or requested name equals this value.
    #[serde(default)]
    pub id: Option<String>,
    /// Exact number of selected records expected.
    #[serde(default)]
    pub count: Option<usize>,
    /// Minimum number of selected records expected.
    #[serde(default)]
    pub count_gte: Option<usize>,
    /// Keeps only records whose `success` flag equals this value.
    #[serde(default)]
    pub success: Option<bool>,
    /// Keeps only records whose serialized source is one of these strings.
    #[serde(default)]
    pub source_in: Option<Vec<String>>,
    /// Path assertion checked against a record's executed arguments.
    #[serde(default)]
    pub args: Option<PathAssertion>,
    /// Path assertion that at least one selected record's original arguments must satisfy.
    #[serde(default)]
    pub args_original: Option<PathAssertion>,
    /// Path assertion checked against a record's executed arguments.
    #[serde(default)]
    pub args_executed: Option<PathAssertion>,
    /// Path assertion that at least one selected record's output must satisfy.
    #[serde(default)]
    pub result_path: Option<PathAssertion>,
}
147
/// Expected disambiguation status, written in `snake_case` in serialized form.
///
/// Each variant is matched against the `DisambiguationStatus` variant of the
/// same name.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum DisambiguationExpectation {
    Triggered,
    Skipped,
    Clarified,
    Abandoned,
    GiveUp,
    Escalated,
    BestGuess,
    Clear,
}
161
/// A set of optional constraints on the JSON value found at `path`.
///
/// All constraints that are set must hold. When the path resolves to nothing,
/// the assertion passes only if `exists` is explicitly `false`.
#[derive(Debug, Clone, Deserialize, Serialize, Default)]
pub struct PathAssertion {
    /// Dot-separated path into the JSON value; an empty path selects the root.
    pub path: String,
    /// The value must equal this JSON value.
    #[serde(default)]
    pub eq: Option<Value>,
    /// The value must differ from this JSON value.
    #[serde(default)]
    pub neq: Option<Value>,
    /// The value must be one of these JSON values (serialized as `in`).
    #[serde(default, rename = "in")]
    pub in_values: Option<Vec<Value>>,
    /// Substring test when both sides are strings, membership test when the
    /// value is an array; any other combination fails.
    #[serde(default)]
    pub contains: Option<Value>,
    /// Whether the path is expected to resolve to a value.
    #[serde(default)]
    pub exists: Option<bool>,
    /// Inclusive numeric lower bound.
    #[serde(default)]
    pub gte: Option<f64>,
    /// Inclusive numeric upper bound.
    #[serde(default)]
    pub lte: Option<f64>,
    /// Exclusive numeric lower bound.
    #[serde(default)]
    pub gt: Option<f64>,
    /// Exclusive numeric upper bound.
    #[serde(default)]
    pub lt: Option<f64>,
}
195
/// Expectation over the facts recorded in the turn evidence.
#[derive(Debug, Clone, Deserialize, Serialize, Default)]
pub struct FactsAssertion {
    /// The facts evidence must belong to this actor id.
    #[serde(default)]
    pub actor: Option<String>,
    /// Keeps only facts whose `category` equals, or ends with, this value.
    #[serde(default)]
    pub category: Option<String>,
    /// Claim that a judge LLM checks against the selected facts.
    #[serde(default)]
    pub semantic: Option<String>,
}
209
/// Expectation over the relationship evidence of a turn.
#[derive(Debug, Clone, Deserialize, Serialize, Default)]
pub struct RelationshipAssertion {
    /// The relationship evidence must belong to this actor id.
    #[serde(default)]
    pub actor: Option<String>,
    /// Whether a current relationship value is expected to be present.
    #[serde(default)]
    pub exists: Option<bool>,
    /// Perspective to read dimensions from; `agent_to_actor` when unset.
    #[serde(default)]
    pub perspective: Option<String>,
    /// Name of the dimension the numeric bounds below apply to.
    #[serde(default)]
    pub dimension: Option<String>,
    /// Inclusive lower bound on the dimension value.
    #[serde(default)]
    pub gte: Option<f64>,
    /// Inclusive upper bound on the dimension value.
    #[serde(default)]
    pub lte: Option<f64>,
    /// Exclusive lower bound on the dimension value.
    #[serde(default)]
    pub gt: Option<f64>,
    /// Exclusive upper bound on the dimension value.
    #[serde(default)]
    pub lt: Option<f64>,
    /// Value the dimension must equal, compared within `f64::EPSILON`.
    #[serde(default)]
    pub eq: Option<f64>,
    /// Minimum `interaction_count`; a missing count is treated as 0.
    #[serde(default)]
    pub interaction_count_gte: Option<u64>,
    /// Minimum length of the `notable_events` array; a missing array is treated as empty.
    #[serde(default)]
    pub notable_event_count_gte: Option<usize>,
}
247
/// Expectation about persona secret disclosure, given as a flag or a secret id.
///
/// Deserialized untagged: a boolean becomes `Bool`, a string becomes `Id`.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub enum SecretAssertion {
    /// Expected value of the persona's `secret_revealed` flag (false when there is no persona evidence).
    Bool(bool),
    /// Secret id that must be among the persona's revealed secret ids.
    Id(String),
}
255
/// Expectation over the orchestration evidence, which is a free-form JSON value.
#[derive(Debug, Clone, Deserialize, Serialize, Default)]
pub struct OrchestrationAssertion {
    /// Expected orchestration kind, compared with the evidence's `type` field
    /// (or `pattern` when `type` is absent).
    #[serde(default)]
    pub pattern: Option<String>,
    /// Alternative spelling of `pattern` (serialized as `type`); used when `pattern` is unset.
    #[serde(default, rename = "type")]
    pub type_name: Option<String>,
    /// The evidence's `final_agent`, `to_agent` or `agent` (first one present) must be one of these.
    #[serde(default)]
    pub final_agent_in: Option<Vec<String>>,
    /// Agent names that must all appear among the agents collected from the evidence.
    #[serde(default)]
    pub agents_include: Option<Vec<String>>,
    /// Expected length of the evidence's `stages` array; a missing array counts as 0.
    #[serde(default)]
    pub stages: Option<usize>,
}
275
/// Expectation over the observability report attached to the turn evidence.
#[derive(Debug, Clone, Deserialize, Serialize, Default)]
pub struct ObservabilityAssertion {
    /// Upper bound (inclusive) on the report summary's total LLM calls.
    #[serde(default)]
    pub total_llm_calls_lte: Option<u64>,
    /// Upper bound (inclusive) on the report summary's total tool calls.
    #[serde(default)]
    pub total_tool_calls_lte: Option<u64>,
    /// Upper bound (inclusive) on the report summary's total tokens.
    #[serde(default)]
    pub total_tokens_lte: Option<u64>,
    /// Upper bound (inclusive) on the report summary's total cost in USD.
    #[serde(default)]
    pub total_cost_usd_lte: Option<f64>,
    /// Per-purpose constraints: the key is a `purpose` dimension value and the
    /// assertion is applied to `{"count": n}` taken from the report's
    /// by-purpose metrics (0 when no metric matches).
    #[serde(default)]
    pub purpose_counts: HashMap<String, PathAssertion>,
    /// Per-status constraints: the key is a `status` dimension value and the
    /// assertion is applied to `{"count": n}` taken from the report's
    /// configured metrics (0 when no metric matches).
    #[serde(default)]
    pub status_counts: HashMap<String, PathAssertion>,
    /// Constraints on summed counts of configured metrics selected by dimensions.
    #[serde(default)]
    pub dimension_counts: Vec<ObservabilityDimensionAssertion>,
}
301
/// Constraint on the summed count of configured metrics that match a set of dimensions.
#[derive(Debug, Clone, Deserialize, Serialize, Default)]
pub struct ObservabilityDimensionAssertion {
    /// Dimension key/value pairs a metric must all carry to be included in the sum.
    #[serde(default)]
    pub match_dimensions: HashMap<String, String>,
    /// Path assertion applied to `{"count": sum}` (serialized as `assert`).
    #[serde(rename = "assert")]
    pub assertion: PathAssertion,
}
310
/// Result of a single check performed while evaluating an assertion.
#[derive(Debug, Clone, Serialize)]
pub struct AssertionResultDetail {
    /// Name of the check that produced this detail (for example `state` or `tool_called`).
    pub assertion: String,
    /// Whether the check passed.
    pub passed: bool,
    /// JSON rendering of the observed value.
    pub actual: Value,
    /// JSON rendering of the expected value.
    pub expected: Value,
    /// Failure message; `None` for passing details.
    pub message: Option<String>,
}
325
/// Overall outcome of evaluating an assertion tree.
pub enum AssertionOutcome {
    /// Every recorded check passed; carries the collected details.
    Passed(Vec<AssertionResultDetail>),
    /// At least one check failed; carries the collected details.
    Failed(Vec<AssertionResultDetail>),
    /// Evaluation could not be completed; carries an error description.
    Error(String),
}
332
333impl AssertionResultDetail {
334 fn pass(name: impl Into<String>, actual: Value, expected: Value) -> Self {
335 Self {
336 assertion: name.into(),
337 passed: true,
338 actual,
339 expected,
340 message: None,
341 }
342 }
343
344 fn fail(
345 name: impl Into<String>,
346 actual: Value,
347 expected: Value,
348 message: impl Into<String>,
349 ) -> Self {
350 Self {
351 assertion: name.into(),
352 passed: false,
353 actual,
354 expected,
355 message: Some(message.into()),
356 }
357 }
358}
359
/// Borrowed inputs shared by every check while evaluating an assertion tree.
///
/// The struct is `Copy`, so it is passed by value to nested evaluations.
#[derive(Clone, Copy)]
pub struct AssertionEvalContext<'a> {
    /// Evidence collected for the turn under test.
    pub evidence: &'a TurnEvidence,
    /// Response text the response checks and the judge operate on.
    pub response: &'a str,
    /// User input, forwarded to the judge when present.
    pub user_input: Option<&'a str>,
    /// Scenario identifier, forwarded to the judge when present.
    pub scenario_id: Option<&'a str>,
    /// Language, forwarded to the judge when present.
    pub language: Option<&'a str>,
    /// Resolver for judge LLMs; judge-based checks fail when it is `None`.
    pub judge_resolver: Option<&'a JudgeResolver>,
}
376
377pub async fn evaluate_assertion(
378 assertion: &Assertion,
379 context: AssertionEvalContext<'_>,
380) -> AssertionOutcome {
381 let mut details = Vec::new();
382
383 if let Some(children) = &assertion.all {
384 for child in children {
385 match evaluate_assertion_boxed(child, context).await {
386 AssertionOutcome::Passed(mut d) => details.append(&mut d),
387 AssertionOutcome::Failed(mut d) => {
388 details.append(&mut d);
389 return AssertionOutcome::Failed(details);
390 }
391 AssertionOutcome::Error(e) => return AssertionOutcome::Error(e),
392 }
393 }
394 details.push(AssertionResultDetail::pass("all", json!(true), json!(true)));
395 }
396
397 if let Some(children) = &assertion.any {
398 let mut failures = Vec::new();
399 for child in children {
400 match evaluate_assertion_boxed(child, context).await {
401 AssertionOutcome::Passed(mut d) => {
402 details.append(&mut d);
403 details.push(AssertionResultDetail::pass("any", json!(true), json!(true)));
404 return AssertionOutcome::Passed(details);
405 }
406 AssertionOutcome::Failed(mut d) => failures.append(&mut d),
407 AssertionOutcome::Error(e) => failures.push(AssertionResultDetail::fail(
408 "any_branch_error",
409 json!(e),
410 json!("pass"),
411 "branch error",
412 )),
413 }
414 }
415 details.extend(failures);
416 details.push(AssertionResultDetail::fail(
417 "any",
418 json!(false),
419 json!(true),
420 "no branch passed",
421 ));
422 }
423
424 if let Some(child) = &assertion.not {
425 match evaluate_assertion_boxed(child, context).await {
426 AssertionOutcome::Passed(_) => details.push(AssertionResultDetail::fail(
427 "not",
428 json!(true),
429 json!(false),
430 "child assertion passed",
431 )),
432 AssertionOutcome::Failed(_) => details.push(AssertionResultDetail::pass(
433 "not",
434 json!(false),
435 json!(false),
436 )),
437 AssertionOutcome::Error(e) => return AssertionOutcome::Error(e),
438 }
439 }
440
441 evaluate_simple(assertion, context, &mut details).await;
442
443 if details.iter().any(|d| !d.passed) {
444 AssertionOutcome::Failed(details)
445 } else {
446 AssertionOutcome::Passed(details)
447 }
448}
449
450fn evaluate_assertion_boxed<'a>(
451 assertion: &'a Assertion,
452 context: AssertionEvalContext<'a>,
453) -> std::pin::Pin<Box<dyn std::future::Future<Output = AssertionOutcome> + Send + 'a>> {
454 Box::pin(evaluate_assertion(assertion, context))
455}
456
457async fn evaluate_simple(
458 assertion: &Assertion,
459 context: AssertionEvalContext<'_>,
460 details: &mut Vec<AssertionResultDetail>,
461) {
462 let evidence = context.evidence;
463 let response = context.response;
464 if let Some(expected) = &assertion.state {
465 check_eq("state", evidence.state.clone(), expected.clone(), details);
466 }
467 if let Some(expected) = &assertion.state_in {
468 push_bool(
469 "state_in",
470 evidence
471 .state
472 .as_ref()
473 .is_some_and(|s| expected.contains(s)),
474 json!(evidence.state),
475 json!(expected),
476 details,
477 );
478 }
479 if let Some(expected) = &assertion.state_not {
480 push_bool(
481 "state_not",
482 evidence.state.as_ref().is_none_or(|s| s != expected),
483 json!(evidence.state),
484 json!(expected),
485 details,
486 );
487 }
488 if let Some(expected) = &assertion.state_history_contains {
489 let passed = evidence
490 .state_history
491 .iter()
492 .any(|event| &event.to == expected || &event.from == expected);
493 push_bool(
494 "state_history_contains",
495 passed,
496 json!(evidence.state_history),
497 json!(expected),
498 details,
499 );
500 }
501 if let Some(expected) = &assertion.response_contains {
502 for item in expected.items() {
503 push_bool(
504 "response_contains",
505 response.contains(&item),
506 json!(response),
507 json!(item),
508 details,
509 );
510 }
511 }
512 if let Some(expected) = &assertion.response_contains_any {
513 let items = expected.items();
514 push_bool(
515 "response_contains_any",
516 items.iter().any(|item| response.contains(item)),
517 json!(response),
518 json!(items),
519 details,
520 );
521 }
522 if let Some(expected) = &assertion.response_not_contains {
523 for item in expected.items() {
524 push_bool(
525 "response_not_contains",
526 !response.contains(&item),
527 json!(response),
528 json!(item),
529 details,
530 );
531 }
532 }
533 if let Some(expected) = assertion.response_not_empty {
534 push_bool(
535 "response_not_empty",
536 (!response.trim().is_empty()) == expected,
537 json!(response),
538 json!(expected),
539 details,
540 );
541 }
542 if let Some(expected) = &assertion.disambiguation {
543 let actual = evidence.disambiguation.as_ref().map(|d| &d.status);
544 push_bool(
545 "disambiguation",
546 actual.is_some_and(|status| disambiguation_matches(status, expected)),
547 json!(actual),
548 json!(expected),
549 details,
550 );
551 }
552 if let Some(expected) = assertion.no_disambiguation {
553 let triggered = evidence.disambiguation.as_ref().is_some_and(|d| {
554 matches!(
555 d.status,
556 DisambiguationStatus::Triggered
557 | DisambiguationStatus::Clarified
558 | DisambiguationStatus::BestGuess
559 )
560 });
561 push_bool(
562 "no_disambiguation",
563 (!triggered) == expected,
564 json!(!triggered),
565 json!(expected),
566 details,
567 );
568 }
569 if let Some(tool) = &assertion.tool_called {
570 evaluate_tool_called(tool, evidence, details);
571 }
572 if let Some(tool_id) = &assertion.tool_not_called {
573 let passed = !evidence
574 .tool_executions
575 .iter()
576 .any(|record| &record.tool_id == tool_id || &record.requested_name == tool_id);
577 push_bool(
578 "tool_not_called",
579 passed,
580 json!(tool_id),
581 json!("not called"),
582 details,
583 );
584 }
585 if let Some(skill_id) = &assertion.skill_triggered {
586 let passed = evidence.skill.as_ref().is_some_and(|skill| {
587 skill.selected_skill_id.as_deref() == Some(skill_id)
588 || skill.executed_skill_id.as_deref() == Some(skill_id)
589 });
590 push_bool(
591 "skill_triggered",
592 passed,
593 json!(evidence.skill),
594 json!(skill_id),
595 details,
596 );
597 }
598 if let Some(expected) = &assertion.metadata_contains {
599 evaluate_metadata_contains(expected, evidence, details);
600 }
601 if let Some(path) = &assertion.metadata_path {
602 evaluate_path(
603 "metadata_path",
604 evidence.response_metadata.as_ref(),
605 path,
606 details,
607 );
608 }
609 if let Some(path) = &assertion.context_path {
610 evaluate_path("context_path", Some(&evidence.context), path, details);
611 }
612 if let Some(expected) = &assertion.facts_include {
613 evaluate_facts(expected, evidence, context.judge_resolver, details).await;
614 }
615 if let Some(expected) = &assertion.relationship {
616 evaluate_relationship(expected, evidence, details);
617 }
618 if let Some(expected) = &assertion.persona_secret_revealed {
619 evaluate_secret(expected, evidence, details);
620 }
621 if let Some(expected) = &assertion.orchestration {
622 evaluate_orchestration(expected, evidence, details);
623 }
624 if let Some(expected) = &assertion.observability {
625 evaluate_observability(expected, evidence, details);
626 }
627 if let Some(criteria) = assertion
628 .judge
629 .as_ref()
630 .or(assertion.response_semantic.as_ref())
631 {
632 if let Some(resolver) = context.judge_resolver {
633 match resolver.resolve(criteria.llm.as_deref()) {
634 Ok(judge) => match judge
635 .evaluate_input(
636 JudgeInput {
637 response,
638 user_input: context.user_input,
639 scenario_id: context.scenario_id,
640 language: context.language,
641 },
642 criteria,
643 )
644 .await
645 {
646 Ok(result) => push_bool(
647 "judge",
648 result.passed,
649 json!(result.overall_score),
650 json!(criteria.pass_threshold),
651 details,
652 ),
653 Err(error) => details.push(AssertionResultDetail::fail(
654 "judge",
655 json!(error.to_string()),
656 json!("valid judge result"),
657 "judge failed",
658 )),
659 },
660 Err(error) => details.push(AssertionResultDetail::fail(
661 "judge",
662 json!(error.to_string()),
663 json!("available judge LLM"),
664 "judge failed",
665 )),
666 }
667 } else {
668 details.push(AssertionResultDetail::fail(
669 "judge",
670 json!(null),
671 json!("judge configured"),
672 "no judge LLM available",
673 ));
674 }
675 }
676}
677
678fn check_eq<T: PartialEq + Serialize>(
679 name: &str,
680 actual: Option<T>,
681 expected: T,
682 details: &mut Vec<AssertionResultDetail>,
683) {
684 push_bool(
685 name,
686 actual.as_ref().is_some_and(|a| *a == expected),
687 json!(actual),
688 json!(expected),
689 details,
690 );
691}
692fn push_bool(
693 name: &str,
694 passed: bool,
695 actual: Value,
696 expected: Value,
697 details: &mut Vec<AssertionResultDetail>,
698) {
699 if passed {
700 details.push(AssertionResultDetail::pass(name, actual, expected));
701 } else {
702 details.push(AssertionResultDetail::fail(
703 name,
704 actual,
705 expected,
706 "assertion failed",
707 ));
708 }
709}
710
711fn disambiguation_matches(
712 actual: &DisambiguationStatus,
713 expected: &DisambiguationExpectation,
714) -> bool {
715 matches!(
716 (actual, expected),
717 (
718 DisambiguationStatus::Triggered,
719 DisambiguationExpectation::Triggered
720 ) | (
721 DisambiguationStatus::Skipped,
722 DisambiguationExpectation::Skipped
723 ) | (
724 DisambiguationStatus::Clarified,
725 DisambiguationExpectation::Clarified
726 ) | (
727 DisambiguationStatus::Abandoned,
728 DisambiguationExpectation::Abandoned
729 ) | (
730 DisambiguationStatus::GiveUp,
731 DisambiguationExpectation::GiveUp
732 ) | (
733 DisambiguationStatus::Escalated,
734 DisambiguationExpectation::Escalated
735 ) | (
736 DisambiguationStatus::BestGuess,
737 DisambiguationExpectation::BestGuess
738 ) | (
739 DisambiguationStatus::Clear,
740 DisambiguationExpectation::Clear
741 )
742 )
743}
744
745fn evaluate_tool_called(
746 assertion: &ToolCalledAssertion,
747 evidence: &TurnEvidence,
748 details: &mut Vec<AssertionResultDetail>,
749) {
750 let (id, object) = match assertion {
751 ToolCalledAssertion::Id(id) => (Some(id.as_str()), None),
752 ToolCalledAssertion::Object(object) => (object.id.as_deref(), Some(object)),
753 };
754 let mut records: Vec<&ToolExecutionRecord> = evidence.tool_executions.iter().collect();
755 if let Some(id) = id {
756 records.retain(|record| record.tool_id == id || record.requested_name == id);
757 }
758 if let Some(object) = object {
759 if let Some(success) = object.success {
760 records.retain(|record| record.success == success);
761 }
762 if let Some(sources) = &object.source_in {
763 records.retain(|record| {
764 sources
765 .iter()
766 .any(|source| *source == serde_plain_source(&record.source))
767 });
768 }
769 }
770 let count = records.len();
771 let mut passed = count > 0;
772 if let Some(object) = object {
773 if let Some(expected) = object.count {
774 passed &= count == expected;
775 }
776 if let Some(expected) = object.count_gte {
777 passed &= count >= expected;
778 }
779 if let Some(path) = object.args.as_ref().or(object.args_executed.as_ref()) {
780 passed &= records
781 .iter()
782 .any(|record| path_matches(&record.arguments_executed, path));
783 }
784 if let Some(path) = &object.args_original {
785 passed &= records
786 .iter()
787 .any(|record| path_matches(&record.arguments_original, path));
788 }
789 if let Some(path) = &object.result_path {
790 passed &= records.iter().any(|record| {
791 record
792 .output
793 .as_ref()
794 .is_some_and(|value| path_matches(value, path))
795 });
796 }
797 }
798 push_bool(
799 "tool_called",
800 passed,
801 json!(count),
802 json!(assertion),
803 details,
804 );
805}
806
807fn serde_plain_source(source: &crate::evidence::ToolExecutionSource) -> String {
808 serde_json::to_string(source)
809 .unwrap_or_default()
810 .trim_matches('"')
811 .to_string()
812}
813fn evaluate_metadata_contains(
814 expected: &HashMap<String, Value>,
815 evidence: &TurnEvidence,
816 details: &mut Vec<AssertionResultDetail>,
817) {
818 let metadata = evidence.response_metadata.as_ref();
819 let passed = metadata.is_some_and(|metadata| {
820 expected
821 .iter()
822 .all(|(key, expected)| metadata.get(key) == Some(expected))
823 }) || (expected.is_empty() && metadata.is_none());
824 push_bool(
825 "metadata_contains",
826 passed,
827 json!(metadata),
828 json!(expected),
829 details,
830 );
831}
832
833fn evaluate_path(
834 name: &str,
835 root: Option<&Value>,
836 assertion: &PathAssertion,
837 details: &mut Vec<AssertionResultDetail>,
838) {
839 let actual = root.and_then(|value| get_path(value, &assertion.path));
840 push_bool(
841 name,
842 path_actual_matches(actual, assertion),
843 json!(actual),
844 json!(assertion),
845 details,
846 );
847}
848fn path_matches(root: &Value, assertion: &PathAssertion) -> bool {
849 path_actual_matches(get_path(root, &assertion.path), assertion)
850}
851
852fn path_actual_matches(actual: Option<&Value>, assertion: &PathAssertion) -> bool {
853 if let Some(exists) = assertion.exists {
854 if exists != actual.is_some() {
855 return false;
856 }
857 }
858 let Some(actual) = actual else {
859 return assertion.exists == Some(false);
860 };
861 if let Some(expected) = &assertion.eq {
862 if actual != expected {
863 return false;
864 }
865 }
866 if let Some(expected) = &assertion.neq {
867 if actual == expected {
868 return false;
869 }
870 }
871 if let Some(values) = &assertion.in_values {
872 if !values.contains(actual) {
873 return false;
874 }
875 }
876 if let Some(expected) = &assertion.contains {
877 let contains = match (actual, expected) {
878 (Value::String(a), Value::String(e)) => a.contains(e),
879 (Value::Array(arr), e) => arr.contains(e),
880 _ => false,
881 };
882 if !contains {
883 return false;
884 }
885 }
886 if let Some(expected) = assertion.gte {
887 if actual.as_f64().unwrap_or(f64::NAN) < expected {
888 return false;
889 }
890 }
891 if let Some(expected) = assertion.lte {
892 if actual.as_f64().unwrap_or(f64::NAN) > expected {
893 return false;
894 }
895 }
896 if let Some(expected) = assertion.gt {
897 if actual.as_f64().unwrap_or(f64::NAN) <= expected {
898 return false;
899 }
900 }
901 if let Some(expected) = assertion.lt {
902 if actual.as_f64().unwrap_or(f64::NAN) >= expected {
903 return false;
904 }
905 }
906 true
907}
908
909fn get_path<'a>(value: &'a Value, path: &str) -> Option<&'a Value> {
910 if path.is_empty() {
911 return Some(value);
912 }
913 let mut current = value;
914 for part in path.split('.') {
915 current = current.get(part)?;
916 }
917 Some(current)
918}
919
920async fn evaluate_facts(
921 assertion: &FactsAssertion,
922 evidence: &TurnEvidence,
923 judge_resolver: Option<&JudgeResolver>,
924 details: &mut Vec<AssertionResultDetail>,
925) {
926 let Some(fact_evidence) = &evidence.facts else {
927 push_bool(
928 "facts_include",
929 false,
930 json!(null),
931 json!(assertion),
932 details,
933 );
934 return;
935 };
936 if let Some(actor) = &assertion.actor {
937 if fact_evidence.actor_id.as_deref() != Some(actor.as_str()) {
938 push_bool(
939 "facts_include",
940 false,
941 json!(fact_evidence.actor_id),
942 json!(actor),
943 details,
944 );
945 return;
946 }
947 }
948 let facts: Vec<Value> = fact_evidence
949 .facts
950 .iter()
951 .filter(|fact| {
952 assertion.category.as_ref().is_none_or(|category| {
953 fact.get("category")
954 .map(|value| value.to_string().trim_matches('"').to_string())
955 .is_some_and(|actual| actual == *category || actual.ends_with(category))
956 })
957 })
958 .cloned()
959 .collect();
960 let mut passed = !facts.is_empty();
961 if let Some(semantic) = &assertion.semantic {
962 if let Some(resolver) = judge_resolver {
963 match resolver.resolve(None) {
964 Ok(judge) => {
965 let criteria = JudgeAssertion {
966 llm: None,
967 pass_threshold: 0.75,
968 criteria: vec![crate::judge::JudgeCriterion::Text(format!(
969 "The fact set supports this claim: {}",
970 semantic
971 ))],
972 };
973 let fact_text = serde_json::to_string(&facts).unwrap_or_default();
974 match judge.evaluate(&fact_text, &criteria).await {
975 Ok(result) => passed &= result.passed,
976 Err(error) => {
977 details.push(AssertionResultDetail::fail(
978 "facts_include",
979 json!(error.to_string()),
980 json!(semantic),
981 "fact semantic judge failed",
982 ));
983 return;
984 }
985 }
986 }
987 Err(error) => {
988 details.push(AssertionResultDetail::fail(
989 "facts_include",
990 json!(error.to_string()),
991 json!(semantic),
992 "fact semantic judge failed",
993 ));
994 return;
995 }
996 }
997 } else {
998 details.push(AssertionResultDetail::fail(
999 "facts_include",
1000 json!(null),
1001 json!(semantic),
1002 "semantic fact assertion requires a judge LLM",
1003 ));
1004 return;
1005 }
1006 }
1007 push_bool(
1008 "facts_include",
1009 passed,
1010 json!(facts),
1011 json!(assertion),
1012 details,
1013 );
1014}
1015
1016fn evaluate_relationship(
1017 assertion: &RelationshipAssertion,
1018 evidence: &TurnEvidence,
1019 details: &mut Vec<AssertionResultDetail>,
1020) {
1021 let Some(rel) = &evidence.relationship else {
1022 push_bool(
1023 "relationship",
1024 assertion.exists == Some(false),
1025 json!(null),
1026 json!(assertion),
1027 details,
1028 );
1029 return;
1030 };
1031 if let Some(actor) = &assertion.actor {
1032 if rel.actor_id.as_deref() != Some(actor.as_str()) {
1033 push_bool(
1034 "relationship",
1035 false,
1036 json!(rel.actor_id),
1037 json!(actor),
1038 details,
1039 );
1040 return;
1041 }
1042 }
1043 let current = rel.current.as_ref();
1044 let mut passed = assertion
1045 .exists
1046 .map(|expected| expected == current.is_some())
1047 .unwrap_or(true);
1048 let perspective = assertion.perspective.as_deref().unwrap_or("agent_to_actor");
1049 if !rel.available_perspectives.iter().any(|p| p == perspective) {
1050 details.push(AssertionResultDetail::fail(
1051 "relationship",
1052 json!(rel.available_perspectives),
1053 json!(perspective),
1054 "relationship perspective unavailable for model",
1055 ));
1056 return;
1057 }
1058 if let Some(count) = assertion.interaction_count_gte {
1059 let actual = current
1060 .and_then(|v| v.get("interaction_count"))
1061 .and_then(Value::as_u64)
1062 .unwrap_or(0);
1063 passed &= actual >= count;
1064 }
1065 if let Some(count) = assertion.notable_event_count_gte {
1066 let actual = current
1067 .and_then(|v| v.get("notable_events"))
1068 .and_then(Value::as_array)
1069 .map(Vec::len)
1070 .unwrap_or(0);
1071 passed &= actual >= count;
1072 }
1073 if let Some(dimension) = &assertion.dimension {
1074 let value = relationship_dimension_value(current, perspective, dimension);
1075 let mut dim_pass = value.is_some();
1076 if let Some(v) = assertion.gte {
1077 dim_pass &= value.unwrap_or(f64::NAN) >= v;
1078 }
1079 if let Some(v) = assertion.lte {
1080 dim_pass &= value.unwrap_or(f64::NAN) <= v;
1081 }
1082 if let Some(v) = assertion.gt {
1083 dim_pass &= value.unwrap_or(f64::NAN) > v;
1084 }
1085 if let Some(v) = assertion.lt {
1086 dim_pass &= value.unwrap_or(f64::NAN) < v;
1087 }
1088 if let Some(v) = assertion.eq {
1089 dim_pass &= (value.unwrap_or(f64::NAN) - v).abs() < f64::EPSILON;
1090 }
1091 passed &= dim_pass;
1092 }
1093 push_bool(
1094 "relationship",
1095 passed,
1096 json!(current),
1097 json!(assertion),
1098 details,
1099 );
1100}
1101
1102fn relationship_dimension_value(
1103 current: Option<&Value>,
1104 perspective: &str,
1105 dimension: &str,
1106) -> Option<f64> {
1107 let current = current?;
1108 match perspective {
1109 "agent_to_actor" => current.get("dimensions")?.get(dimension)?.as_f64(),
1110 "perceived_actor_to_agent" => current
1111 .get("perceived_actor_to_agent")?
1112 .get(dimension)?
1113 .as_f64(),
1114 "mutual" => current.get("dimensions")?.get(dimension)?.as_f64(),
1115 _ => None,
1116 }
1117}
1118
1119fn evaluate_secret(
1120 assertion: &SecretAssertion,
1121 evidence: &TurnEvidence,
1122 details: &mut Vec<AssertionResultDetail>,
1123) {
1124 let persona = evidence.persona.as_ref();
1125 let actual = persona.is_some_and(|p| p.secret_revealed);
1126 let passed = match assertion {
1127 SecretAssertion::Bool(expected) => actual == *expected,
1128 SecretAssertion::Id(id) => persona.is_some_and(|p| p.revealed_secret_ids.contains(id)),
1129 };
1130 push_bool(
1131 "persona_secret_revealed",
1132 passed,
1133 json!(actual),
1134 json!(assertion),
1135 details,
1136 );
1137}
1138
1139fn evaluate_orchestration(
1140 assertion: &OrchestrationAssertion,
1141 evidence: &TurnEvidence,
1142 details: &mut Vec<AssertionResultDetail>,
1143) {
1144 let Some(value) = &evidence.orchestration else {
1145 push_bool(
1146 "orchestration",
1147 false,
1148 json!(null),
1149 json!(assertion),
1150 details,
1151 );
1152 return;
1153 };
1154 let mut passed = true;
1155 if let Some(pattern) = assertion.pattern.as_ref().or(assertion.type_name.as_ref()) {
1156 passed &= value
1157 .get("type")
1158 .or_else(|| value.get("pattern"))
1159 .and_then(Value::as_str)
1160 == Some(pattern.as_str());
1161 }
1162 if let Some(finals) = &assertion.final_agent_in {
1163 let actual = value
1164 .get("final_agent")
1165 .or_else(|| value.get("to_agent"))
1166 .or_else(|| value.get("agent"))
1167 .and_then(Value::as_str);
1168 passed &= actual.is_some_and(|a| finals.iter().any(|f| f == a));
1169 }
1170 if let Some(required) = &assertion.agents_include {
1171 let agents = collect_orchestration_agents(value);
1172 passed &= required
1173 .iter()
1174 .all(|agent| agents.iter().any(|a| a == agent));
1175 }
1176 if let Some(stages) = assertion.stages {
1177 let actual = value
1178 .get("stages")
1179 .and_then(Value::as_array)
1180 .map(Vec::len)
1181 .unwrap_or(0);
1182 passed &= actual == stages;
1183 }
1184 push_bool(
1185 "orchestration",
1186 passed,
1187 value.clone(),
1188 json!(assertion),
1189 details,
1190 );
1191}
1192
1193fn collect_orchestration_agents(value: &Value) -> Vec<String> {
1194 let mut agents = Vec::new();
1195 collect_agent_strings(value, &mut agents);
1196 agents.sort();
1197 agents.dedup();
1198 agents
1199}
1200
1201fn collect_agent_strings(value: &Value, agents: &mut Vec<String>) {
1202 match value {
1203 Value::Object(map) => {
1204 for (key, value) in map {
1205 if matches!(
1206 key.as_str(),
1207 "agent" | "agent_id" | "id" | "final_agent" | "to_agent" | "from_agent"
1208 ) {
1209 if let Some(text) = value.as_str() {
1210 agents.push(text.to_string());
1211 }
1212 }
1213 collect_agent_strings(value, agents);
1214 }
1215 }
1216 Value::Array(values) => {
1217 for value in values {
1218 if let Some(text) = value.as_str() {
1219 agents.push(text.to_string());
1220 }
1221 collect_agent_strings(value, agents);
1222 }
1223 }
1224 _ => {}
1225 }
1226}
1227
1228fn evaluate_observability(
1229 assertion: &ObservabilityAssertion,
1230 evidence: &TurnEvidence,
1231 details: &mut Vec<AssertionResultDetail>,
1232) {
1233 let report = evidence
1234 .observability
1235 .as_ref()
1236 .and_then(|o| o.report.as_ref());
1237 let Some(report) = report else {
1238 push_bool(
1239 "observability",
1240 false,
1241 json!(null),
1242 json!(assertion),
1243 details,
1244 );
1245 return;
1246 };
1247 let mut passed = true;
1248 if let Some(max) = assertion.total_llm_calls_lte {
1249 passed &= report.summary.total_llm_calls <= max;
1250 }
1251 if let Some(max) = assertion.total_tool_calls_lte {
1252 passed &= report.summary.total_tool_calls <= max;
1253 }
1254 if let Some(max) = assertion.total_tokens_lte {
1255 passed &= report.summary.total_tokens <= max;
1256 }
1257 if let Some(max) = assertion.total_cost_usd_lte {
1258 passed &= report.summary.total_cost_usd <= max;
1259 }
1260 for (purpose, path_assertion) in &assertion.purpose_counts {
1261 let count = report
1262 .by_purpose
1263 .iter()
1264 .find(|metric| metric.dimensions.get("purpose") == Some(purpose))
1265 .map(|metric| metric.count)
1266 .unwrap_or(0);
1267 passed &= path_matches(&json!({"count": count}), path_assertion);
1268 }
1269 for (status, path_assertion) in &assertion.status_counts {
1270 let count = report
1271 .configured
1272 .iter()
1273 .find(|metric| metric.dimensions.get("status") == Some(status))
1274 .map(|metric| metric.count)
1275 .unwrap_or(0);
1276 passed &= path_matches(&json!({"count": count}), path_assertion);
1277 }
1278 for dimension_assertion in &assertion.dimension_counts {
1279 let count: u64 = report
1280 .configured
1281 .iter()
1282 .filter(|metric| {
1283 dimension_assertion
1284 .match_dimensions
1285 .iter()
1286 .all(|(key, value)| metric.dimensions.get(key) == Some(value))
1287 })
1288 .map(|metric| metric.count)
1289 .sum();
1290 passed &= path_matches(&json!({"count": count}), &dimension_assertion.assertion);
1291 }
1292 push_bool(
1293 "observability",
1294 passed,
1295 json!(report.summary),
1296 json!(assertion),
1297 details,
1298 );
1299}
1300
#[cfg(test)]
mod tests {
    use super::*;
    use crate::evidence::{FactsEvidence, ToolExecutionSource, TurnObservabilityEvidence};
    use ai_agents_observability::{
        AggregatedMetrics, CostBreakdown, CostStats, LatencyStats, ObservabilityReport,
        ReportSummary, TokenBreakdown, TokenStats,
    };

    /// Builds the baseline evidence shared by the tests: state `ready` reached from `start`,
    /// one successful mocked `lookup_order` call, one stored `user_preference` fact for
    /// `actor-1`, a two-stage pipeline orchestration, and no observability data.
    fn evidence() -> TurnEvidence {
        let transition = ai_agents_core::StateTransitionEvent {
            from: "start".to_owned(),
            to: "ready".to_owned(),
            reason: "test".to_owned(),
            timestamp: chrono::Utc::now(),
        };
        let lookup_call = ToolExecutionRecord {
            call_id: "call-1".to_owned(),
            tool_id: "lookup_order".to_owned(),
            requested_name: "lookup_order".to_owned(),
            source: ToolExecutionSource::Mock,
            state: None,
            actor_id: Some("actor-1".to_owned()),
            arguments_original: json!({"id":"ORD-1"}),
            arguments_executed: json!({"id":"ORD-1"}),
            success: true,
            output: Some(json!({"status":"cancellable"})),
            error: None,
            metadata: None,
            started_at: chrono::Utc::now(),
            duration_ms: 1,
            observability_span_id: None,
        };
        let remembered = FactsEvidence {
            actor_id: Some("actor-1".to_owned()),
            facts: vec![json!({"category":"user_preference","content":"Prefers concise answers"})],
            before_count: None,
            after_count: Some(1),
        };
        let pipeline = json!({
            "type":"pipeline",
            "stages":[{"agent_id":"writer"},{"agent_id":"editor"}],
            "agents":["writer","editor"]
        });

        TurnEvidence {
            response_metadata: Some(json!({"intent":"greeting","score":0.9})),
            state: Some("ready".to_owned()),
            state_history: vec![transition],
            context: json!({"user":{"tier":"vip"}}),
            tool_executions: vec![lookup_call],
            skill: None,
            disambiguation: None,
            facts: Some(remembered),
            relationship: None,
            persona: None,
            orchestration: Some(pipeline),
            observability: None,
        }
    }

    /// Runs `assertion` against `evidence` with the minimal context used by the focused tests:
    /// response `"ok"` and no user input, scenario id, language, or judge resolver.
    async fn evaluate_bare(assertion: &Assertion, evidence: &TurnEvidence) -> AssertionOutcome {
        let context = AssertionEvalContext {
            evidence,
            response: "ok",
            user_input: None,
            scenario_id: None,
            language: None,
            judge_resolver: None,
        };
        evaluate_assertion(assertion, context).await
    }

    /// A combined assertion over state, response text, metadata, context, tool call, facts and
    /// orchestration passes against the baseline evidence.
    #[tokio::test]
    async fn evaluates_structured_assertions() {
        let tier_is_vip = PathAssertion {
            path: "user.tier".to_owned(),
            eq: Some(json!("vip")),
            ..Default::default()
        };
        let status_is_cancellable = PathAssertion {
            path: "status".to_owned(),
            eq: Some(json!("cancellable")),
            ..Default::default()
        };
        let lookup_succeeded = ToolCalledObject {
            id: Some("lookup_order".to_owned()),
            success: Some(true),
            result_path: Some(status_is_cancellable),
            ..Default::default()
        };
        let preference_fact = FactsAssertion {
            actor: Some("actor-1".to_owned()),
            category: Some("user_preference".to_owned()),
            semantic: None,
        };
        let two_stage_pipeline = OrchestrationAssertion {
            pattern: Some("pipeline".to_owned()),
            agents_include: Some(vec!["writer".to_owned(), "editor".to_owned()]),
            stages: Some(2),
            ..Default::default()
        };
        let assertion = Assertion {
            state: Some("ready".to_owned()),
            state_history_contains: Some("ready".to_owned()),
            response_contains: Some(StringList::One("Hello".to_owned())),
            metadata_contains: Some(HashMap::from([("intent".to_owned(), json!("greeting"))])),
            context_path: Some(tier_is_vip),
            tool_called: Some(ToolCalledAssertion::Object(lookup_succeeded)),
            facts_include: Some(preference_fact),
            orchestration: Some(two_stage_pipeline),
            ..Default::default()
        };

        let turn = evidence();
        let context = AssertionEvalContext {
            evidence: &turn,
            response: "Hello there",
            user_input: Some("Hello"),
            scenario_id: Some("test"),
            language: Some("en"),
            judge_resolver: None,
        };
        let outcome = evaluate_assertion(&assertion, context).await;

        assert!(matches!(outcome, AssertionOutcome::Passed(_)));
    }

    /// A facts assertion naming an actor other than the one in the evidence fails.
    #[tokio::test]
    async fn facts_actor_mismatch_fails() {
        let wrong_actor = FactsAssertion {
            actor: Some("other".to_owned()),
            category: Some("user_preference".to_owned()),
            semantic: None,
        };
        let assertion = Assertion {
            facts_include: Some(wrong_actor),
            ..Default::default()
        };

        let turn = evidence();
        let outcome = evaluate_bare(&assertion, &turn).await;

        assert!(matches!(outcome, AssertionOutcome::Failed(_)));
    }

    /// A dimension-count assertion matching on a subset of a configured metric's dimensions
    /// sees that metric's count.
    #[tokio::test]
    async fn observability_dimension_counts_match_configured_metrics() {
        // The metric carries two dimensions; the assertion below matches on only one of them.
        let background_facts = AggregatedMetrics {
            dimensions: HashMap::from([
                ("background".to_owned(), "true".to_owned()),
                ("maintenance".to_owned(), "facts".to_owned()),
            ]),
            count: 2,
            errors: 0,
            latency: LatencyStats::default(),
            tokens: TokenStats::default(),
            cost: CostStats::default(),
        };
        let report = ObservabilityReport {
            summary: ReportSummary::default(),
            configured: vec![background_facts],
            by_model: vec![],
            by_purpose: vec![],
            by_language: vec![],
            by_state: vec![],
            by_agent: vec![],
            by_orchestration_pattern: vec![],
            cost_breakdown: CostBreakdown::default(),
            token_breakdown: TokenBreakdown::default(),
            dropped_events: 0,
        };
        let mut turn = evidence();
        turn.observability = Some(TurnObservabilityEvidence {
            trace_id: Some("trace".to_owned()),
            span_ids: vec!["span".to_owned()],
            report: Some(report),
        });

        let at_least_two_background = ObservabilityDimensionAssertion {
            match_dimensions: HashMap::from([("background".to_owned(), "true".to_owned())]),
            assertion: PathAssertion {
                path: "count".to_owned(),
                gte: Some(2.0),
                ..Default::default()
            },
        };
        let assertion = Assertion {
            observability: Some(ObservabilityAssertion {
                dimension_counts: vec![at_least_two_background],
                ..Default::default()
            }),
            ..Default::default()
        };

        let outcome = evaluate_bare(&assertion, &turn).await;

        assert!(matches!(outcome, AssertionOutcome::Passed(_)));
    }
}