1use crate::error::EvaluationError;
2use crate::evaluate::store::{AssertionResultStore, LLMResponseStore, TaskRegistry, TaskType};
3use crate::evaluate::trace::TraceContextBuilder;
4use crate::tasks::trace::execute_trace_assertions;
5use crate::tasks::traits::EvaluationTask;
6use chrono::{DateTime, Utc};
7use scouter_types::genai::traits::ProfileExt;
8use scouter_types::genai::{
9 AssertionResult, ExecutionPlan, GenAIEvalProfile, GenAIEvalSet, TraceAssertionTask,
10};
11use scouter_types::sql::TraceSpan;
12use scouter_types::{Assertion, GenAIEvalRecord};
13use serde_json::Value;
14use std::collections::HashMap;
15use std::sync::Arc;
16use tokio::sync::RwLock;
17use tokio::task::JoinSet;
18use tracing::{debug, error, instrument};
19
/// Shared, cheaply-cloneable state for one record evaluation run.
///
/// All fields are behind `Arc` so clones handed to spawned tasks observe the
/// same stores and registry.
#[derive(Debug, Clone)]
struct ExecutionContext {
    /// Immutable record context every task evaluates against.
    base_context: Arc<Value>,
    /// Results of completed assertion and trace-assertion tasks, keyed by task id.
    assertion_store: Arc<RwLock<AssertionResultStore>>,
    /// Raw JSON responses of completed LLM judge tasks, keyed by task id.
    llm_response_store: Arc<RwLock<LLMResponseStore>>,
    /// Task types, dependency edges, and skip state.
    task_registry: Arc<RwLock<TaskRegistry>>,
    /// Stage index per task id, derived from the execution plan at construction.
    task_stages: HashMap<String, i32>,
}
28
29impl ExecutionContext {
30 fn new(base_context: Value, registry: TaskRegistry, execution_plan: &ExecutionPlan) -> Self {
31 debug!("Creating ExecutionContext");
32 Self {
33 base_context: Arc::new(base_context),
34 assertion_store: Arc::new(RwLock::new(AssertionResultStore::new())),
35 llm_response_store: Arc::new(RwLock::new(LLMResponseStore::new())),
36 task_registry: Arc::new(RwLock::new(registry)),
37 task_stages: Self::build_task_stages(execution_plan),
38 }
39 }
40
41 fn build_task_stages(execution_plan: &ExecutionPlan) -> HashMap<String, i32> {
42 execution_plan
43 .nodes
44 .iter()
45 .map(|(id, node)| (id.clone(), node.stage as i32))
46 .collect()
47 }
48
49 async fn build_scoped_context(&self, depends_on: &[String]) -> Value {
50 if depends_on.is_empty() {
51 return self.base_context.as_ref().clone();
52 }
53
54 let mut scoped_context = self.build_context_map(&self.base_context);
55 let registry = self.task_registry.read().await;
56
57 for dep_id in depends_on {
58 match registry.get_type(dep_id) {
59 Some(TaskType::Assertion) => {
60 let store = self.assertion_store.read().await;
61 if let Some(result) = store.retrieve(dep_id) {
62 scoped_context.insert(dep_id.clone(), result.2.actual.clone());
63 }
64 }
65 Some(TaskType::LLMJudge) => {
66 let store = self.llm_response_store.read().await;
67 if let Some(response) = store.retrieve(dep_id) {
68 scoped_context.insert(dep_id.clone(), response.clone());
69 }
70 }
71
72 Some(TaskType::TraceAssertion) => {
73 let store = self.assertion_store.read().await;
75 if let Some(result) = store.retrieve(dep_id) {
76 scoped_context.insert(dep_id.clone(), result.2.actual.clone());
77 }
78 }
79 None => {}
80 }
81 }
82
83 Value::Object(scoped_context)
84 }
85
86 fn build_context_map(&self, value: &Value) -> serde_json::Map<String, Value> {
87 match value {
88 Value::Object(obj) => obj.clone(),
89 _ => {
90 let mut map = serde_json::Map::new();
91 map.insert("context".to_string(), value.clone());
92 map
93 }
94 }
95 }
96
97 async fn store_assertion(
98 &self,
99 task_id: String,
100 start_time: DateTime<Utc>,
101 end_time: DateTime<Utc>,
102 result: AssertionResult,
103 ) {
104 self.assertion_store
105 .write()
106 .await
107 .store(task_id, start_time, end_time, result);
108 }
109
110 async fn store_llm_response(&self, task_id: String, response: Value) {
111 self.llm_response_store
112 .write()
113 .await
114 .store(task_id, response);
115 }
116}
117
/// Decides whether tasks are ready to run based on the completion, skip
/// state, and conditional outcome of their dependencies.
struct DependencyChecker {
    /// Shared stores and registry consulted during dependency checks.
    context: ExecutionContext,
}
121
122impl DependencyChecker {
123 fn new(context: ExecutionContext) -> Self {
124 Self { context }
125 }
126
127 async fn check_dependencies_satisfied(&self, task_id: &str) -> Option<bool> {
128 debug!("Checking dependencies for task: {}", task_id);
129 let dependencies = {
130 let registry = self.context.task_registry.read().await;
131 match registry.get_dependencies(task_id) {
132 Some(deps) => deps,
133 None => {
134 debug!("Task '{}' has no dependencies, ready to execute", task_id);
136 return Some(true);
137 }
138 }
139 };
140
141 debug!("Task '{}' has dependencies: {:?}", task_id, dependencies);
142
143 let dep_metadata = {
144 let registry = self.context.task_registry.read().await;
145 dependencies
146 .iter()
147 .map(|dep_id| {
148 (
149 dep_id.clone(),
150 registry.is_conditional(dep_id),
151 registry.is_skipped(dep_id),
152 )
153 })
154 .collect::<Vec<_>>()
155 };
156
157 for (dep_id, is_conditional, is_skipped) in dep_metadata {
158 debug!(
159 "Checking dependency '{}' for task '{}': conditional={}, skipped={}",
160 dep_id, task_id, is_conditional, is_skipped
161 );
162 if is_skipped {
163 self.mark_skipped(task_id).await;
164 return Some(false);
165 }
166
167 let completed = self.check_task_completed(&dep_id).await;
168 if !completed {
169 if is_conditional {
170 self.mark_skipped(task_id).await;
171 return Some(false);
172 }
173 return None;
174 }
175
176 if is_conditional && !self.check_assertion_passed(&dep_id).await? {
177 self.mark_skipped(task_id).await;
178 return Some(false);
179 }
180 }
181
182 Some(true)
183 }
184
185 async fn check_task_completed(&self, task_id: &str) -> bool {
186 let registry = self.context.task_registry.read().await;
187 match registry.get_type(task_id) {
188 Some(TaskType::Assertion) => self
189 .context
190 .assertion_store
191 .read()
192 .await
193 .retrieve(task_id)
194 .is_some(),
195 Some(TaskType::LLMJudge) => self
196 .context
197 .llm_response_store
198 .read()
199 .await
200 .retrieve(task_id)
201 .is_some(),
202 Some(TaskType::TraceAssertion) => self
203 .context
204 .assertion_store
205 .read()
206 .await
207 .retrieve(task_id)
208 .is_some(),
209 None => false,
210 }
211 }
212
213 async fn check_assertion_passed(&self, task_id: &str) -> Option<bool> {
214 self.context
215 .assertion_store
216 .read()
217 .await
218 .retrieve(task_id)
219 .map(|res| res.2.passed)
220 }
221
222 async fn mark_skipped(&self, task_id: &str) {
223 self.context
224 .task_registry
225 .write()
226 .await
227 .mark_skipped(task_id.to_string());
228 }
229
230 async fn filter_executable_tasks<'a>(&self, task_ids: &'a [String]) -> Vec<&'a str> {
231 debug!("Filtering executable tasks from: {:?}", task_ids);
232 let mut executable = Vec::with_capacity(task_ids.len());
233
234 for task_id in task_ids {
235 if let Some(true) = self.check_dependencies_satisfied(task_id).await {
236 executable.push(task_id.as_str());
237 }
238 }
239
240 executable
241 }
242}
243
/// Runs the tasks of one execution-plan stage against the shared context.
struct TaskExecutor {
    /// Shared stores and registry for this evaluation run.
    context: ExecutionContext,
    /// Profile describing all assertion, judge, and trace-assertion tasks.
    profile: Arc<GenAIEvalProfile>,
    /// Builds trace-derived context from the record's spans.
    context_builder: TraceContextBuilder,
}
249
250impl TaskExecutor {
251 fn new(
252 context: ExecutionContext,
253 profile: Arc<GenAIEvalProfile>,
254 spans: Arc<Vec<TraceSpan>>,
255 ) -> Self {
256 debug!("Creating TaskExecutor");
257 let context_builder = TraceContextBuilder::new(spans);
258 Self {
259 context,
260 profile,
261 context_builder,
262 }
263 }
264
265 #[instrument(skip_all)]
266 async fn execute_level(&self, task_ids: &[String]) -> Result<(), EvaluationError> {
267 let checker = DependencyChecker::new(self.context.clone());
268 let executable_tasks = checker.filter_executable_tasks(task_ids).await;
269
270 debug!("Executable tasks for level: {:?}", executable_tasks);
271
272 if executable_tasks.is_empty() {
273 return Ok(());
274 }
275
276 let (assertions, judges, traces_assertions) = self.partition_tasks(executable_tasks).await;
277
278 debug!(
279 "Executing level with {} assertions, {} LLM judges, and {} trace assertions",
280 assertions.len(),
281 judges.len(),
282 traces_assertions.len()
283 );
284
285 let _result = tokio::try_join!(
286 self.execute_assertions(&assertions),
287 self.execute_llm_judges(&judges),
288 self.execute_trace_assertions(&traces_assertions)
289 )?;
290
291 Ok(())
292 }
293
294 async fn partition_tasks<'a>(
295 &self,
296 task_ids: Vec<&'a str>,
297 ) -> (Vec<&'a str>, Vec<&'a str>, Vec<&'a str>) {
298 let registry = self.context.task_registry.read().await;
299 let mut assertions = Vec::new();
300 let mut traces_assertions = Vec::new();
301 let mut judges = Vec::new();
302
303 for id in task_ids {
304 match registry.get_type(id) {
305 Some(TaskType::Assertion) => assertions.push(id),
306 Some(TaskType::LLMJudge) => judges.push(id),
307 Some(TaskType::TraceAssertion) => traces_assertions.push(id),
308 None => continue,
309 }
310 }
311
312 (assertions, judges, traces_assertions)
313 }
314
315 async fn execute_assertions(&self, task_ids: &[&str]) -> Result<(), EvaluationError> {
316 debug!("Executing assertion tasks: {:?}", task_ids);
317 if task_ids.is_empty() {
318 return Ok(());
319 }
320
321 let mut join_set = JoinSet::new();
322
323 for &task_id in task_ids {
324 let task_id = task_id.to_string();
325 let context = self.context.clone();
326 let profile = self.profile.clone();
327
328 join_set.spawn(async move {
329 Self::execute_assertion_task(&task_id, &context, &profile).await
330 });
331 }
332
333 while let Some(result) = join_set.join_next().await {
334 result.map_err(|e| {
335 EvaluationError::GenAIEvaluatorError(format!("Task join error: {}", e))
336 })??;
337 }
338
339 Ok(())
340 }
341
342 async fn execute_trace_assertions(&self, task_ids: &[&str]) -> Result<(), EvaluationError> {
343 debug!("Executing trace assertion tasks: {:?}", task_ids);
344 if task_ids.is_empty() {
345 return Ok(());
346 }
347 let tasks: Vec<TraceAssertionTask> = task_ids
348 .iter()
349 .filter_map(|&task_id| self.profile.get_trace_assertion_by_id(task_id))
350 .cloned()
351 .collect();
352
353 debug!("Executing {} trace assertion tasks", tasks.len());
354
355 let results = execute_trace_assertions(&self.context_builder, &tasks).inspect_err(|e| {
356 error!("Failed to execute trace assertions: {:?}", e);
357 })?;
358
359 for (task_id, result) in results.results {
360 let start_time = Utc::now(); let end_time = Utc::now();
362
363 self.context
364 .store_assertion(task_id, start_time, end_time, result)
365 .await;
366 }
367
368 Ok(())
369 }
370
371 async fn execute_llm_judges(&self, task_ids: &[&str]) -> Result<(), EvaluationError> {
372 debug!("Executing LLM judge tasks: {:?}", task_ids);
373 if task_ids.is_empty() {
374 return Ok(());
375 }
376
377 let mut join_set = JoinSet::new();
378
379 for &task_id in task_ids {
380 let task_id = task_id.to_string();
381 let context = self.context.clone();
382 let profile = self.profile.clone();
383
384 join_set.spawn(async move {
385 let result = Self::execute_llm_judge_task(&task_id, &context, &profile).await;
386 result
387 });
388 }
389
390 let mut results = HashMap::with_capacity(task_ids.len());
391 while let Some(result) = join_set.join_next().await {
392 let (judge_id, start_time, response) = result.map_err(|e| {
393 EvaluationError::GenAIEvaluatorError(format!("Task join error: {}", e))
394 })??;
395 results.insert(judge_id, (start_time, response));
396 }
397
398 self.process_llm_judge_results(results).await?;
399 Ok(())
400 }
401
402 #[instrument(skip_all, fields(task_id = %task_id))]
403 async fn execute_assertion_task(
404 task_id: &str,
405 context: &ExecutionContext,
406 profile: &GenAIEvalProfile,
407 ) -> Result<(), EvaluationError> {
408 let start_time = Utc::now();
409
410 let task = profile
411 .get_assertion_by_id(task_id)
412 .ok_or_else(|| EvaluationError::TaskNotFound(task_id.to_string()))?;
413
414 let scoped_context = context.build_scoped_context(&task.depends_on).await;
415 let result = task.execute(&scoped_context)?;
416
417 let end_time = Utc::now();
418 context
419 .store_assertion(task_id.to_string(), start_time, end_time, result)
420 .await;
421 Ok(())
422 }
423
424 #[instrument(skip_all, fields(task_id = %task_id))]
425 async fn execute_llm_judge_task(
426 task_id: &str,
427 context: &ExecutionContext,
428 profile: &GenAIEvalProfile,
429 ) -> Result<(String, DateTime<Utc>, serde_json::Value), EvaluationError> {
430 debug!("Starting LLM judge task: {}", task_id);
431 let start_time = Utc::now();
432 let judge = profile
433 .get_llm_judge_by_id(task_id)
434 .ok_or_else(|| EvaluationError::TaskNotFound(task_id.to_string()))?;
435
436 debug!("Building scoped context for: {}", task_id);
437 let scoped_context = context.build_scoped_context(&judge.depends_on).await;
438
439 let workflow = profile.workflow.as_ref().ok_or_else(|| {
440 EvaluationError::GenAIEvaluatorError("No workflow defined".to_string())
441 })?;
442
443 debug!("Executing workflow task: {}", task_id);
444
445 let response = workflow
447 .execute_task(task_id, &scoped_context)
448 .await
449 .inspect_err(|e| error!("LLM task {} failed: {:?}", task_id, e))?;
450
451 debug!("Successfully completed LLM judge task: {}", task_id);
452 Ok((task_id.to_string(), start_time, response))
453 }
454
455 async fn process_llm_judge_results(
456 &self,
457 results: HashMap<String, (DateTime<Utc>, Value)>,
458 ) -> Result<(), EvaluationError> {
459 for (task_id, (start_time, response)) in results {
460 if let Some(task) = self.profile.get_llm_judge_by_id(&task_id) {
461 let assertion_result = task.execute(&response)?;
462
463 self.context
464 .store_llm_response(task_id.clone(), response)
465 .await;
466
467 self.context
468 .store_assertion(task_id, start_time, Utc::now(), assertion_result)
469 .await;
470 }
471 }
472 Ok(())
473 }
474}
475
/// Collects stored task results into the final evaluation set.
struct ResultCollector {
    /// Shared stores and stage map produced during execution.
    context: ExecutionContext,
}
479
impl ResultCollector {
    fn new(context: ExecutionContext) -> Self {
        Self { context }
    }

    /// Assembles the final evaluation set from stored task results.
    ///
    /// Iterates the profile's assertion, judge, and trace-assertion tasks,
    /// emitting one task result per completed task. Conditional tasks
    /// (`condition == true`) are reported but excluded from the pass/fail
    /// tally, so the workflow pass rate reflects only real checks. Tasks
    /// with no stored result (e.g. skipped) produce no record at all.
    async fn build_eval_set(
        &self,
        record: &GenAIEvalRecord,
        profile: &GenAIEvalProfile,
        duration_ms: i64,
        execution_plan: ExecutionPlan,
    ) -> GenAIEvalSet {
        let mut passed_count = 0;
        let mut failed_count = 0;
        let mut records = Vec::new();

        let assert_store = self.context.assertion_store.read().await;

        for assertion in &profile.tasks.assertion {
            if let Some((start_time, end_time, result)) = assert_store.retrieve(&assertion.id) {
                // Only non-conditional tasks count toward the pass rate.
                if !assertion.condition {
                    if result.passed {
                        passed_count += 1;
                    } else {
                        failed_count += 1;
                    }
                }

                // -1 flags a task missing from the execution plan's stages.
                let stage = *self.context.task_stages.get(&assertion.id).unwrap_or(&-1);

                records.push(scouter_types::GenAIEvalTaskResult {
                    created_at: chrono::Utc::now(),
                    start_time,
                    end_time,
                    record_uid: record.uid.clone(),
                    entity_id: record.entity_id,
                    task_id: assertion.id.clone(),
                    task_type: assertion.task_type.clone(),
                    passed: result.passed,
                    value: result.to_metric_value(),
                    assertion: Assertion::FieldPath(assertion.context_path.clone()),
                    expected: result.expected.clone(),
                    actual: result.actual.clone(),
                    message: result.message.clone(),
                    operator: assertion.operator.clone(),
                    entity_uid: String::new(),
                    condition: assertion.condition,
                    stage,
                });
            }
        }

        // Judge results are scored as assertions, so they live in the same
        // assertion store as plain assertions.
        for judge in &profile.tasks.judge {
            if let Some((start_time, end_time, result)) = assert_store.retrieve(&judge.id) {
                if !judge.condition {
                    if result.passed {
                        passed_count += 1;
                    } else {
                        failed_count += 1;
                    }
                }

                let stage = *self.context.task_stages.get(&judge.id).unwrap_or(&-1);

                records.push(scouter_types::GenAIEvalTaskResult {
                    created_at: chrono::Utc::now(),
                    start_time,
                    end_time,
                    record_uid: record.uid.clone(),
                    entity_id: record.entity_id,
                    task_id: judge.id.clone(),
                    task_type: judge.task_type.clone(),
                    passed: result.passed,
                    value: result.to_metric_value(),
                    assertion: Assertion::FieldPath(judge.context_path.clone()),
                    // NOTE(review): judges report the configured expected
                    // value while assertions report the stored result's
                    // expected — confirm this asymmetry is intentional.
                    expected: judge.expected_value.clone(),
                    actual: result.actual.clone(),
                    message: result.message.clone(),
                    operator: judge.operator.clone(),
                    entity_uid: String::new(),
                    condition: judge.condition,
                    stage,
                });
            }
        }

        for trace_assertion in &profile.tasks.trace {
            if let Some((start_time, end_time, result)) = assert_store.retrieve(&trace_assertion.id)
            {
                if !trace_assertion.condition {
                    if result.passed {
                        passed_count += 1;
                    } else {
                        failed_count += 1;
                    }
                }

                let stage = *self
                    .context
                    .task_stages
                    .get(&trace_assertion.id)
                    .unwrap_or(&-1);

                records.push(scouter_types::GenAIEvalTaskResult {
                    created_at: chrono::Utc::now(),
                    start_time,
                    end_time,
                    record_uid: record.uid.clone(),
                    entity_id: record.entity_id,
                    task_id: trace_assertion.id.clone(),
                    task_type: trace_assertion.task_type.clone(),
                    passed: result.passed,
                    value: result.to_metric_value(),
                    assertion: Assertion::TraceAssertion(trace_assertion.assertion.clone()),
                    expected: result.expected.clone(),
                    actual: result.actual.clone(),
                    message: result.message.clone(),
                    operator: trace_assertion.operator.clone(),
                    entity_uid: String::new(),
                    condition: trace_assertion.condition,
                    stage,
                });
            }
        }

        // Workflow summary: totals exclude conditional tasks by construction.
        let workflow_record = scouter_types::GenAIEvalWorkflowResult {
            created_at: chrono::Utc::now(),
            id: record.id,
            entity_id: record.entity_id,
            record_uid: record.uid.clone(),
            total_tasks: passed_count + failed_count,
            passed_tasks: passed_count,
            failed_tasks: failed_count,
            // Guard against division by zero when nothing was counted.
            pass_rate: if passed_count + failed_count == 0 {
                0.0
            } else {
                passed_count as f64 / (passed_count + failed_count) as f64
            },
            duration_ms,
            entity_uid: String::new(),
            execution_plan,
        };

        GenAIEvalSet::new(records, workflow_record)
    }
}
626
627pub struct GenAIEvaluator;
628
629impl GenAIEvaluator {
630 #[instrument(skip_all, fields(record_uid = %record.uid))]
631 pub async fn process_event_record(
632 record: &GenAIEvalRecord,
633 profile: Arc<GenAIEvalProfile>,
634 spans: Arc<Vec<TraceSpan>>,
635 ) -> Result<GenAIEvalSet, EvaluationError> {
636 let begin = chrono::Utc::now();
637
638 let mut registry = TaskRegistry::new();
639 Self::register_tasks(&mut registry, &profile);
640
641 let execution_plan = profile.get_execution_plan()?;
642
643 let context = ExecutionContext::new(record.context.clone(), registry, &execution_plan);
644 let executor = TaskExecutor::new(context.clone(), profile.clone(), spans);
645
646 debug!(
647 "Starting evaluation for record: {} with {} stages",
648 record.uid,
649 execution_plan.stages.len()
650 );
651
652 for (stage_idx, stage_tasks) in execution_plan.stages.iter().enumerate() {
653 debug!(
654 "Executing stage {} with {} tasks",
655 stage_idx,
656 stage_tasks.len()
657 );
658 executor
659 .execute_level(stage_tasks)
660 .await
661 .inspect_err(|e| error!("Failed to execute stage {}: {:?}", stage_idx, e))?;
662 }
663
664 let end = chrono::Utc::now();
665 let duration_ms = (end - begin).num_milliseconds();
666
667 let collector = ResultCollector::new(context);
668 let eval_set = collector
669 .build_eval_set(record, &profile, duration_ms, execution_plan)
670 .await;
671
672 Ok(eval_set)
673 }
674
675 fn register_tasks(registry: &mut TaskRegistry, profile: &GenAIEvalProfile) {
676 for task in &profile.tasks.assertion {
677 registry.register(task.id.clone(), TaskType::Assertion, task.condition);
678 if !task.depends_on.is_empty() {
679 registry.register_dependencies(task.id.clone(), task.depends_on.clone());
680 }
681 }
682
683 for task in &profile.tasks.judge {
684 registry.register(task.id.clone(), TaskType::LLMJudge, task.condition);
685 if !task.depends_on.is_empty() {
686 registry.register_dependencies(task.id.clone(), task.depends_on.clone());
687 }
688 }
689
690 for task in &profile.tasks.trace {
691 registry.register(task.id.clone(), TaskType::TraceAssertion, task.condition);
692 if !task.depends_on.is_empty() {
693 registry.register_dependencies(task.id.clone(), task.depends_on.clone());
694 }
695 }
696 }
697}
698
699#[cfg(test)]
700mod tests {
701
702 use chrono::Utc;
703 use potato_head::mock::{create_score_prompt, LLMTestServer};
704 use scouter_mocks::{
705 create_multi_service_trace, create_nested_trace, create_sequence_pattern_trace,
706 create_simple_trace, create_trace_with_attributes, create_trace_with_errors, init_tracing,
707 };
708 use scouter_types::genai::{
709 AggregationType, SpanFilter, SpanStatus, TraceAssertion, TraceAssertionTask,
710 };
711 use scouter_types::genai::{
712 AssertionTask, ComparisonOperator, GenAIAlertConfig, GenAIEvalConfig, GenAIEvalProfile,
713 LLMJudgeTask,
714 };
715 use scouter_types::genai::{EvaluationTaskType, EvaluationTasks};
716 use scouter_types::GenAIEvalRecord;
717 use serde_json::Value;
718 use std::sync::Arc;
719
720 use crate::evaluate::GenAIEvaluator;
721
    /// Profile with a root assertion, an LLM judge, and two assertions that
    /// depend on the judge's response fields (score and reason).
    async fn create_assert_judge_profile() -> GenAIEvalProfile {
        let prompt = create_score_prompt(Some(vec!["input".to_string()]));

        let assertion_level_1 = AssertionTask {
            id: "input_check".to_string(),
            context_path: Some("input.foo".to_string()),
            operator: ComparisonOperator::Equals,
            expected_value: Value::String("bar".to_string()),
            description: Some("Check if input.foo is bar".to_string()),
            task_type: EvaluationTaskType::Assertion,
            depends_on: vec![],
            result: None,
            condition: false,
            item_context_path: None,
        };

        let judge_task_level_1 = LLMJudgeTask::new_rs(
            "query_relevance",
            prompt.clone(),
            Value::Number(1.into()),
            Some("score".to_string()),
            ComparisonOperator::GreaterThanOrEqual,
            None,
            None,
            None,
            None,
        );

        let assert_query_score = AssertionTask {
            id: "assert_score".to_string(),
            context_path: Some("query_relevance.score".to_string()),
            operator: ComparisonOperator::IsNumeric,
            expected_value: Value::Bool(true),
            depends_on: vec!["query_relevance".to_string()],
            task_type: EvaluationTaskType::Assertion,
            description: Some("Check that score is numeric".to_string()),
            result: None,
            condition: false,
            item_context_path: None,
        };

        let assert_query_reason = AssertionTask {
            id: "assert_reason".to_string(),
            context_path: Some("query_relevance.reason".to_string()),
            operator: ComparisonOperator::IsString,
            expected_value: Value::Bool(true),
            depends_on: vec!["query_relevance".to_string()],
            task_type: EvaluationTaskType::Assertion,
            description: Some("Check that reason is alphabetic".to_string()),
            result: None,
            condition: false,
            item_context_path: None,
        };

        let tasks = EvaluationTasks::new()
            .add_task(assertion_level_1)
            .add_task(judge_task_level_1)
            .add_task(assert_query_score)
            .add_task(assert_query_reason)
            .build();

        let alert_config = GenAIAlertConfig::default();

        let drift_config =
            GenAIEvalConfig::new("scouter", "ML", "0.1.0", 1.0, alert_config, None).unwrap();

        GenAIEvalProfile::new(drift_config, tasks).await.unwrap()
    }
790
    /// Profile with three independent assertions (no judges, no trace tasks).
    async fn create_assert_profile() -> GenAIEvalProfile {
        let assert1 = AssertionTask {
            id: "input_foo_check".to_string(),
            context_path: Some("input.foo".to_string()),
            operator: ComparisonOperator::Equals,
            expected_value: Value::String("bar".to_string()),
            description: Some("Check if input.foo is bar".to_string()),
            task_type: EvaluationTaskType::Assertion,
            depends_on: vec![],
            result: None,
            condition: false,
            item_context_path: None,
        };
        let assert2 = AssertionTask {
            id: "input_bar_check".to_string(),
            context_path: Some("input.bar".to_string()),
            operator: ComparisonOperator::IsNumeric,
            expected_value: Value::Bool(true),
            depends_on: vec![],
            task_type: EvaluationTaskType::Assertion,
            description: Some("Check that bar is numeric".to_string()),
            result: None,
            condition: false,
            item_context_path: None,
        };

        let assert3 = AssertionTask {
            id: "input_baz_check".to_string(),
            context_path: Some("input.baz".to_string()),
            operator: ComparisonOperator::HasLengthEqual,
            expected_value: Value::Number(3.into()),
            depends_on: vec![],
            task_type: EvaluationTaskType::Assertion,
            description: Some("Check that baz has length 3".to_string()),
            result: None,
            condition: false,
            item_context_path: None,
        };

        let tasks = EvaluationTasks::new()
            .add_task(assert1)
            .add_task(assert2)
            .add_task(assert3)
            .build();

        let alert_config = GenAIAlertConfig::default();

        let drift_config =
            GenAIEvalConfig::new("scouter", "ML", "0.1.0", 1.0, alert_config, None).unwrap();

        GenAIEvalProfile::new(drift_config, tasks).await.unwrap()
    }
843
    /// Profile with a single span-sequence trace assertion (root -> child_1
    /// -> child_2).
    async fn create_trace_profile_simple() -> GenAIEvalProfile {
        let trace_task = TraceAssertionTask {
            id: "check_span_sequence".to_string(),
            assertion: TraceAssertion::SpanSequence {
                span_names: vec![
                    "root".to_string(),
                    "child_1".to_string(),
                    "child_2".to_string(),
                ],
            },
            operator: ComparisonOperator::Equals,
            expected_value: Value::Bool(true),
            description: Some("Verify span execution order".to_string()),
            task_type: EvaluationTaskType::TraceAssertion,
            depends_on: vec![],
            condition: false,
            result: None,
        };

        let tasks = EvaluationTasks::new().add_task(trace_task).build();

        let alert_config = GenAIAlertConfig::default();
        let drift_config =
            GenAIEvalConfig::new("scouter", "trace_test", "0.1.0", 1.0, alert_config, None)
                .unwrap();

        GenAIEvalProfile::new(drift_config, tasks).await.unwrap()
    }
872
    /// Profile exercising span filters: count of error-status spans and
    /// existence of a span named "recovery".
    async fn create_trace_profile_with_filters() -> GenAIEvalProfile {
        let span_count_task = TraceAssertionTask {
            id: "count_error_spans".to_string(),
            assertion: TraceAssertion::SpanCount {
                filter: SpanFilter::WithStatus {
                    status: SpanStatus::Error,
                },
            },
            operator: ComparisonOperator::Equals,
            expected_value: Value::Number(1.into()),
            description: Some("Count spans with error status".to_string()),
            task_type: EvaluationTaskType::TraceAssertion,
            depends_on: vec![],
            condition: false,
            result: None,
        };

        let span_exists_task = TraceAssertionTask {
            id: "check_recovery_span".to_string(),
            assertion: TraceAssertion::SpanExists {
                filter: SpanFilter::ByName {
                    name: "recovery".to_string(),
                },
            },
            operator: ComparisonOperator::Equals,
            expected_value: Value::Bool(true),
            description: Some("Verify recovery span exists".to_string()),
            task_type: EvaluationTaskType::TraceAssertion,
            depends_on: vec![],
            condition: false,
            result: None,
        };

        let tasks = EvaluationTasks::new()
            .add_task(span_count_task)
            .add_task(span_exists_task)
            .build();

        let alert_config = GenAIAlertConfig::default();
        let drift_config =
            GenAIEvalConfig::new("scouter", "trace_test", "0.1.0", 1.0, alert_config, None)
                .unwrap();

        GenAIEvalProfile::new(drift_config, tasks).await.unwrap()
    }
918
    /// Profile exercising span attributes: an exact attribute match on
    /// "model" and a Sum aggregation over "tokens.output".
    async fn create_trace_profile_with_attributes() -> GenAIEvalProfile {
        let attribute_task = TraceAssertionTask {
            id: "check_model_name".to_string(),
            assertion: TraceAssertion::SpanAttribute {
                filter: SpanFilter::ByName {
                    name: "api_call".to_string(),
                },
                attribute_key: "model".to_string(),
            },
            operator: ComparisonOperator::Equals,
            expected_value: Value::String("gpt-4".to_string()),
            description: Some("Verify model attribute".to_string()),
            task_type: EvaluationTaskType::TraceAssertion,
            depends_on: vec![],
            condition: false,
            result: None,
        };

        let aggregation_task = TraceAssertionTask {
            id: "sum_token_output".to_string(),
            assertion: TraceAssertion::SpanAggregation {
                filter: SpanFilter::ByName {
                    name: "api_call".to_string(),
                },
                attribute_key: "tokens.output".to_string(),
                aggregation: AggregationType::Sum,
            },
            operator: ComparisonOperator::Equals,
            expected_value: Value::Number(300.into()),
            description: Some("Sum output tokens".to_string()),
            task_type: EvaluationTaskType::TraceAssertion,
            depends_on: vec![],
            condition: false,
            result: None,
        };

        let tasks = EvaluationTasks::new()
            .add_task(attribute_task)
            .add_task(aggregation_task)
            .build();

        let alert_config = GenAIAlertConfig::default();
        let drift_config =
            GenAIEvalConfig::new("scouter", "trace_test", "0.1.0", 1.0, alert_config, None)
                .unwrap();

        GenAIEvalProfile::new(drift_config, tasks).await.unwrap()
    }
967
    /// Profile with trace-level assertions: sequence counting, total trace
    /// duration, and distinct service count.
    async fn create_trace_profile_complex() -> GenAIEvalProfile {
        let sequence_count_task = TraceAssertionTask {
            id: "count_tool_agent_sequence".to_string(),
            assertion: TraceAssertion::SpanCount {
                filter: SpanFilter::Sequence {
                    names: vec!["call_tool".to_string(), "run_agent".to_string()],
                },
            },
            operator: ComparisonOperator::Equals,
            expected_value: Value::Number(2.into()),
            description: Some("Count tool->agent sequences".to_string()),
            task_type: EvaluationTaskType::TraceAssertion,
            depends_on: vec![],
            condition: false,
            result: None,
        };

        let trace_duration_task = TraceAssertionTask {
            id: "check_trace_duration".to_string(),
            assertion: TraceAssertion::TraceDuration {},
            operator: ComparisonOperator::LessThanOrEqual,
            expected_value: Value::Number(1000.into()),
            description: Some("Verify trace completes within 1s".to_string()),
            task_type: EvaluationTaskType::TraceAssertion,
            depends_on: vec![],
            condition: false,
            result: None,
        };

        let service_count_task = TraceAssertionTask {
            id: "check_service_count".to_string(),
            assertion: TraceAssertion::TraceServiceCount {},
            operator: ComparisonOperator::Equals,
            expected_value: Value::Number(1.into()),
            description: Some("Verify single service".to_string()),
            task_type: EvaluationTaskType::TraceAssertion,
            depends_on: vec![],
            condition: false,
            result: None,
        };

        let tasks = EvaluationTasks::new()
            .add_task(sequence_count_task)
            .add_task(trace_duration_task)
            .add_task(service_count_task)
            .build();

        let alert_config = GenAIAlertConfig::default();
        let drift_config =
            GenAIEvalConfig::new("scouter", "trace_test", "0.1.0", 1.0, alert_config, None)
                .unwrap();

        GenAIEvalProfile::new(drift_config, tasks).await.unwrap()
    }
1022
    /// Profile with a conditional (gating) error check followed by a
    /// dependent recovery-span assertion — exercises skip semantics.
    async fn create_trace_profile_with_dependencies() -> GenAIEvalProfile {
        let error_check = TraceAssertionTask {
            id: "check_has_errors".to_string(),
            assertion: TraceAssertion::TraceErrorCount {},
            operator: ComparisonOperator::GreaterThan,
            expected_value: Value::Number(0.into()),
            description: Some("Check if trace has errors".to_string()),
            task_type: EvaluationTaskType::TraceAssertion,
            depends_on: vec![],
            // condition: true makes this a gate for dependent tasks.
            condition: true,
            result: None,
        };

        let recovery_check = TraceAssertionTask {
            id: "check_recovery_exists".to_string(),
            assertion: TraceAssertion::SpanExists {
                filter: SpanFilter::ByName {
                    name: "recovery".to_string(),
                },
            },
            operator: ComparisonOperator::Equals,
            expected_value: Value::Bool(true),
            description: Some("Verify recovery span exists when errors present".to_string()),
            task_type: EvaluationTaskType::TraceAssertion,
            depends_on: vec!["check_has_errors".to_string()],
            condition: false,
            result: None,
        };

        let tasks = EvaluationTasks::new()
            .add_task(error_check)
            .add_task(recovery_check)
            .build();

        let alert_config = GenAIAlertConfig::default();
        let drift_config =
            GenAIEvalConfig::new("scouter", "trace_test", "0.1.0", 1.0, alert_config, None)
                .unwrap();

        GenAIEvalProfile::new(drift_config, tasks).await.unwrap()
    }
1064
    /// All four tasks (1 assertion + 1 judge + 2 dependent assertions)
    /// should pass against the mock LLM server.
    #[test]
    fn test_evaluator_assert_judge_all_pass() {
        let mut mock = LLMTestServer::new();
        mock.start_server().unwrap();
        let runtime = tokio::runtime::Runtime::new().unwrap();
        let profile = runtime.block_on(async { create_assert_judge_profile().await });

        assert!(profile.has_llm_tasks());
        assert!(profile.has_assertions());

        let context = serde_json::json!({
            "input": {
                "foo": "bar" }
        });

        let record = GenAIEvalRecord::new_rs(
            context,
            Utc::now(),
            "UID123".to_string(),
            "ENTITY123".to_string(),
            None,
            None,
        );

        let result_set = runtime.block_on(async {
            GenAIEvaluator::process_event_record(&record, Arc::new(profile), Arc::new(vec![])).await
        });

        let eval_set = result_set.unwrap();
        assert!(eval_set.passed_tasks() == 4);
        assert!(eval_set.failed_tasks() == 0);

        mock.stop_server().unwrap();
    }
1099
1100 #[test]
1101 fn test_evaluator_assert_one_fail() {
1102 let mut mock = LLMTestServer::new();
1103 mock.start_server().unwrap();
1104 let runtime = tokio::runtime::Runtime::new().unwrap();
1105 let profile = runtime.block_on(async { create_assert_profile().await });
1106
1107 assert!(!profile.has_llm_tasks());
1108 assert!(profile.has_assertions());
1109
1110 let context = serde_json::json!({
1112 "input": {
1113 "foo": "bar",
1114 "bar": "not_a_number",
1115 "baz": [1, 2, 3]}
1116 });
1117
1118 let record = GenAIEvalRecord::new_rs(
1119 context,
1120 Utc::now(),
1121 "UID123".to_string(),
1122 "ENTITY123".to_string(),
1123 None,
1124 None,
1125 );
1126
1127 let result_set = runtime.block_on(async {
1128 GenAIEvaluator::process_event_record(&record, Arc::new(profile), Arc::new(vec![])).await
1129 });
1130
1131 let eval_set = result_set.unwrap();
1132 assert!(eval_set.passed_tasks() == 2);
1133 assert!(eval_set.failed_tasks() == 1);
1134
1135 mock.stop_server().unwrap();
1136 }
1137
1138 #[test]
1139 fn test_evaluator_trace_simple_sequence() {
1140 init_tracing();
1141 let runtime = tokio::runtime::Runtime::new().unwrap();
1142 let profile = runtime.block_on(create_trace_profile_simple());
1143 let spans = Arc::new(create_simple_trace());
1144
1145 let context = serde_json::json!({});
1146 let record = GenAIEvalRecord::new_rs(
1147 context,
1148 Utc::now(),
1149 "TRACE_UID_001".to_string(),
1150 "ENTITY_001".to_string(),
1151 None,
1152 None,
1153 );
1154
1155 let result = runtime.block_on(GenAIEvaluator::process_event_record(
1156 &record,
1157 Arc::new(profile),
1158 spans,
1159 ));
1160
1161 let eval_set = result.unwrap();
1162 assert_eq!(eval_set.passed_tasks(), 1);
1163 assert_eq!(eval_set.failed_tasks(), 0);
1164 }
1165
1166 #[test]
1167 fn test_evaluator_trace_error_detection() {
1168 let runtime = tokio::runtime::Runtime::new().unwrap();
1169 let profile = runtime.block_on(create_trace_profile_with_filters());
1170 let spans = Arc::new(create_trace_with_errors());
1171
1172 let context = serde_json::json!({});
1173 let record = GenAIEvalRecord::new_rs(
1174 context,
1175 Utc::now(),
1176 "TRACE_UID_002".to_string(),
1177 "ENTITY_002".to_string(),
1178 None,
1179 None,
1180 );
1181
1182 let result = runtime.block_on(GenAIEvaluator::process_event_record(
1183 &record,
1184 Arc::new(profile),
1185 spans,
1186 ));
1187
1188 let eval_set = result.unwrap();
1189 assert_eq!(eval_set.passed_tasks(), 2);
1190 assert_eq!(eval_set.failed_tasks(), 0);
1191 }
1192
1193 #[test]
1194 fn test_evaluator_trace_attribute_extraction() {
1195 init_tracing();
1196 let runtime = tokio::runtime::Runtime::new().unwrap();
1197 let profile = runtime.block_on(create_trace_profile_with_attributes());
1198 let spans = Arc::new(create_trace_with_attributes());
1199
1200 let context = serde_json::json!({});
1201 let record = GenAIEvalRecord::new_rs(
1202 context,
1203 Utc::now(),
1204 "TRACE_UID_003".to_string(),
1205 "ENTITY_003".to_string(),
1206 None,
1207 None,
1208 );
1209
1210 let result = runtime.block_on(GenAIEvaluator::process_event_record(
1211 &record,
1212 Arc::new(profile),
1213 spans,
1214 ));
1215
1216 let eval_set = result.unwrap();
1217 assert_eq!(eval_set.passed_tasks(), 2);
1218 assert_eq!(eval_set.failed_tasks(), 0);
1219 }
1220
1221 #[test]
1222 fn test_evaluator_trace_sequence_pattern() {
1223 init_tracing();
1224 let runtime = tokio::runtime::Runtime::new().unwrap();
1225 let profile = runtime.block_on(create_trace_profile_complex());
1226 let spans = Arc::new(create_sequence_pattern_trace());
1227
1228 let context = serde_json::json!({});
1229 let record = GenAIEvalRecord::new_rs(
1230 context,
1231 Utc::now(),
1232 "TRACE_UID_004".to_string(),
1233 "ENTITY_004".to_string(),
1234 None,
1235 None,
1236 );
1237
1238 let result = runtime.block_on(GenAIEvaluator::process_event_record(
1239 &record,
1240 Arc::new(profile),
1241 spans,
1242 ));
1243
1244 let eval_set = result.unwrap();
1245 assert_eq!(eval_set.passed_tasks(), 3);
1246 assert_eq!(eval_set.failed_tasks(), 0);
1247 }
1248
1249 #[test]
1250 fn test_evaluator_trace_conditional_dependency() {
1251 let runtime = tokio::runtime::Runtime::new().unwrap();
1252 let profile = runtime.block_on(create_trace_profile_with_dependencies());
1253 let spans = Arc::new(create_trace_with_errors());
1254
1255 let context = serde_json::json!({});
1256 let record = GenAIEvalRecord::new_rs(
1257 context,
1258 Utc::now(),
1259 "TRACE_UID_005".to_string(),
1260 "ENTITY_005".to_string(),
1261 None,
1262 None,
1263 );
1264
1265 let result = runtime.block_on(GenAIEvaluator::process_event_record(
1266 &record,
1267 Arc::new(profile),
1268 spans,
1269 ));
1270
1271 let eval_set = result.unwrap();
1272 assert_eq!(eval_set.passed_tasks(), 1); assert_eq!(eval_set.failed_tasks(), 0);
1274 }
1275
1276 #[test]
1277 fn test_evaluator_trace_multi_service() {
1278 let runtime = tokio::runtime::Runtime::new().unwrap();
1279
1280 let task = TraceAssertionTask {
1281 id: "check_service_count".to_string(),
1282 assertion: TraceAssertion::TraceServiceCount {},
1283 operator: ComparisonOperator::Equals,
1284 expected_value: Value::Number(3.into()),
1285 description: Some("Verify three services".to_string()),
1286 task_type: EvaluationTaskType::TraceAssertion,
1287 depends_on: vec![],
1288 condition: false,
1289 result: None,
1290 };
1291
1292 let tasks = EvaluationTasks::new().add_task(task).build();
1293 let alert_config = GenAIAlertConfig::default();
1294 let drift_config =
1295 GenAIEvalConfig::new("scouter", "trace_test", "0.1.0", 1.0, alert_config, None)
1296 .unwrap();
1297
1298 let profile = runtime
1299 .block_on(GenAIEvalProfile::new(drift_config, tasks))
1300 .unwrap();
1301 let spans = Arc::new(create_multi_service_trace());
1302
1303 let context = serde_json::json!({});
1304 let record = GenAIEvalRecord::new_rs(
1305 context,
1306 Utc::now(),
1307 "TRACE_UID_006".to_string(),
1308 "ENTITY_006".to_string(),
1309 None,
1310 None,
1311 );
1312
1313 let result = runtime.block_on(GenAIEvaluator::process_event_record(
1314 &record,
1315 Arc::new(profile),
1316 spans,
1317 ));
1318
1319 let eval_set = result.unwrap();
1320 assert_eq!(eval_set.passed_tasks(), 1);
1321 assert_eq!(eval_set.failed_tasks(), 0);
1322 }
1323
1324 #[test]
1325 fn test_evaluator_trace_assertion_failure() {
1326 let runtime = tokio::runtime::Runtime::new().unwrap();
1327
1328 let task = TraceAssertionTask {
1329 id: "check_wrong_sequence".to_string(),
1330 assertion: TraceAssertion::SpanSequence {
1331 span_names: vec![
1332 "root".to_string(),
1333 "wrong_child".to_string(),
1334 "child_2".to_string(),
1335 ],
1336 },
1337 operator: ComparisonOperator::Equals,
1338 expected_value: Value::Bool(true),
1339 description: Some("Verify incorrect span order".to_string()),
1340 task_type: EvaluationTaskType::TraceAssertion,
1341 depends_on: vec![],
1342 condition: false,
1343 result: None,
1344 };
1345
1346 let tasks = EvaluationTasks::new().add_task(task).build();
1347 let alert_config = GenAIAlertConfig::default();
1348 let drift_config =
1349 GenAIEvalConfig::new("scouter", "trace_test", "0.1.0", 1.0, alert_config, None)
1350 .unwrap();
1351
1352 let profile = runtime
1353 .block_on(GenAIEvalProfile::new(drift_config, tasks))
1354 .unwrap();
1355 let spans = Arc::new(create_simple_trace());
1356
1357 let context = serde_json::json!({});
1358 let record = GenAIEvalRecord::new_rs(
1359 context,
1360 Utc::now(),
1361 "TRACE_UID_007".to_string(),
1362 "ENTITY_007".to_string(),
1363 None,
1364 None,
1365 );
1366
1367 let result = runtime.block_on(GenAIEvaluator::process_event_record(
1368 &record,
1369 Arc::new(profile),
1370 spans,
1371 ));
1372
1373 let eval_set = result.unwrap();
1374 assert_eq!(eval_set.passed_tasks(), 0);
1375 assert_eq!(eval_set.failed_tasks(), 1);
1376 }
1377
1378 #[test]
1379 fn test_evaluator_trace_mixed_assertions() {
1380 init_tracing();
1381 let runtime = tokio::runtime::Runtime::new().unwrap();
1382
1383 let trace_task = TraceAssertionTask {
1384 id: "check_max_depth".to_string(),
1385 assertion: TraceAssertion::TraceMaxDepth {},
1386 operator: ComparisonOperator::Equals,
1387 expected_value: Value::Number(2.into()),
1388 description: Some("Verify max depth".to_string()),
1389 task_type: EvaluationTaskType::TraceAssertion,
1390 depends_on: vec![],
1391 condition: false,
1392 result: None,
1393 };
1394
1395 let regular_assertion = AssertionTask {
1396 id: "check_context".to_string(),
1397 context_path: Some("metadata.version".to_string()),
1398 operator: ComparisonOperator::Equals,
1399 expected_value: Value::String("1.0.0".to_string()),
1400 description: Some("Verify version".to_string()),
1401 task_type: EvaluationTaskType::Assertion,
1402 depends_on: vec![],
1403 result: None,
1404 condition: false,
1405 item_context_path: None,
1406 };
1407
1408 let tasks = EvaluationTasks::new()
1409 .add_task(trace_task)
1410 .add_task(regular_assertion)
1411 .build();
1412
1413 let alert_config = GenAIAlertConfig::default();
1414 let drift_config =
1415 GenAIEvalConfig::new("scouter", "trace_test", "0.1.0", 1.0, alert_config, None)
1416 .unwrap();
1417
1418 let profile = runtime
1419 .block_on(GenAIEvalProfile::new(drift_config, tasks))
1420 .unwrap();
1421 let spans = Arc::new(create_nested_trace());
1422
1423 let context = serde_json::json!({
1424 "metadata": {
1425 "version": "1.0.0"
1426 }
1427 });
1428
1429 let record = GenAIEvalRecord::new_rs(
1430 context,
1431 Utc::now(),
1432 "TRACE_UID_008".to_string(),
1433 "ENTITY_008".to_string(),
1434 None,
1435 None,
1436 );
1437
1438 let result = runtime.block_on(GenAIEvaluator::process_event_record(
1439 &record,
1440 Arc::new(profile),
1441 spans,
1442 ));
1443
1444 let eval_set = result.unwrap();
1445 assert_eq!(eval_set.passed_tasks(), 2);
1446 assert_eq!(eval_set.failed_tasks(), 0);
1447 }
1448
1449 #[test]
1450 fn test_evaluator_trace_duration_filter() {
1451 init_tracing();
1452 let runtime = tokio::runtime::Runtime::new().unwrap();
1453
1454 let task = TraceAssertionTask {
1455 id: "check_slow_spans".to_string(),
1456 assertion: TraceAssertion::SpanCount {
1457 filter: SpanFilter::WithDuration {
1458 min_ms: Some(100.0),
1459 max_ms: None,
1460 },
1461 },
1462 operator: ComparisonOperator::GreaterThanOrEqual,
1463 expected_value: Value::Number(2.into()),
1464 description: Some("Count spans over 100ms".to_string()),
1465 task_type: EvaluationTaskType::TraceAssertion,
1466 depends_on: vec![],
1467 condition: false,
1468 result: None,
1469 };
1470
1471 let tasks = EvaluationTasks::new().add_task(task).build();
1472 let alert_config = GenAIAlertConfig::default();
1473 let drift_config =
1474 GenAIEvalConfig::new("scouter", "trace_test", "0.1.0", 1.0, alert_config, None)
1475 .unwrap();
1476
1477 let profile = runtime
1478 .block_on(GenAIEvalProfile::new(drift_config, tasks))
1479 .unwrap();
1480 let spans = Arc::new(create_nested_trace());
1481
1482 let context = serde_json::json!({});
1483 let record = GenAIEvalRecord::new_rs(
1484 context,
1485 Utc::now(),
1486 "TRACE_UID_009".to_string(),
1487 "ENTITY_009".to_string(),
1488 None,
1489 None,
1490 );
1491
1492 let result = runtime.block_on(GenAIEvaluator::process_event_record(
1493 &record,
1494 Arc::new(profile),
1495 spans,
1496 ));
1497
1498 let eval_set = result.unwrap();
1499 assert_eq!(eval_set.passed_tasks(), 1);
1500 assert_eq!(eval_set.failed_tasks(), 0);
1501 }
1502}