//! evaluator.rs — GenAI evaluation orchestration.
//! Path: scouter_evaluate/evaluate/evaluator.rs
1use crate::error::EvaluationError;
2use crate::evaluate::store::{AssertionResultStore, LLMResponseStore, TaskRegistry, TaskType};
3use crate::evaluate::trace::TraceContextBuilder;
4use crate::tasks::trace::execute_trace_assertions;
5use crate::tasks::traits::EvaluationTask;
6use chrono::{DateTime, Utc};
7use scouter_types::genai::traits::ProfileExt;
8use scouter_types::genai::{
9    AssertionResult, ExecutionPlan, GenAIEvalProfile, GenAIEvalSet, TraceAssertionTask,
10};
11use scouter_types::sql::TraceSpan;
12use scouter_types::{Assertion, GenAIEvalRecord};
13use serde_json::Value;
14use std::collections::HashMap;
15use std::sync::Arc;
16use tokio::sync::RwLock;
17use tokio::task::JoinSet;
18use tracing::{debug, error, instrument};
19
/// Shared state for a single evaluation run, threaded through every
/// concurrently spawned task.
///
/// Cloning is cheap: the `Arc`-wrapped fields only bump reference counts,
/// and `task_stages` is a plain copy of a map populated once at
/// construction and never mutated afterwards.
#[derive(Debug, Clone)]
struct ExecutionContext {
    // The record's base evaluation context (immutable once built).
    base_context: Arc<Value>,
    // Results of completed assertion AND trace-assertion tasks.
    assertion_store: Arc<RwLock<AssertionResultStore>>,
    // Raw LLM responses keyed by judge task id.
    llm_response_store: Arc<RwLock<LLMResponseStore>>,
    // Task metadata: types, dependencies, conditional/skipped flags.
    task_registry: Arc<RwLock<TaskRegistry>>,
    // Task id -> execution-plan stage index, used when building results.
    task_stages: HashMap<String, i32>,
}
28
29impl ExecutionContext {
30    fn new(base_context: Value, registry: TaskRegistry, execution_plan: &ExecutionPlan) -> Self {
31        debug!("Creating ExecutionContext");
32        Self {
33            base_context: Arc::new(base_context),
34            assertion_store: Arc::new(RwLock::new(AssertionResultStore::new())),
35            llm_response_store: Arc::new(RwLock::new(LLMResponseStore::new())),
36            task_registry: Arc::new(RwLock::new(registry)),
37            task_stages: Self::build_task_stages(execution_plan),
38        }
39    }
40
41    fn build_task_stages(execution_plan: &ExecutionPlan) -> HashMap<String, i32> {
42        execution_plan
43            .nodes
44            .iter()
45            .map(|(id, node)| (id.clone(), node.stage as i32))
46            .collect()
47    }
48
49    async fn build_scoped_context(&self, depends_on: &[String]) -> Value {
50        if depends_on.is_empty() {
51            return self.base_context.as_ref().clone();
52        }
53
54        let mut scoped_context = self.build_context_map(&self.base_context);
55        let registry = self.task_registry.read().await;
56
57        for dep_id in depends_on {
58            match registry.get_type(dep_id) {
59                Some(TaskType::Assertion) => {
60                    let store = self.assertion_store.read().await;
61                    if let Some(result) = store.retrieve(dep_id) {
62                        scoped_context.insert(dep_id.clone(), result.2.actual.clone());
63                    }
64                }
65                Some(TaskType::LLMJudge) => {
66                    let store = self.llm_response_store.read().await;
67                    if let Some(response) = store.retrieve(dep_id) {
68                        scoped_context.insert(dep_id.clone(), response.clone());
69                    }
70                }
71
72                Some(TaskType::TraceAssertion) => {
73                    // Trace assertions store their results in the assertion store
74                    let store = self.assertion_store.read().await;
75                    if let Some(result) = store.retrieve(dep_id) {
76                        scoped_context.insert(dep_id.clone(), result.2.actual.clone());
77                    }
78                }
79                None => {}
80            }
81        }
82
83        Value::Object(scoped_context)
84    }
85
86    fn build_context_map(&self, value: &Value) -> serde_json::Map<String, Value> {
87        match value {
88            Value::Object(obj) => obj.clone(),
89            _ => {
90                let mut map = serde_json::Map::new();
91                map.insert("context".to_string(), value.clone());
92                map
93            }
94        }
95    }
96
97    async fn store_assertion(
98        &self,
99        task_id: String,
100        start_time: DateTime<Utc>,
101        end_time: DateTime<Utc>,
102        result: AssertionResult,
103    ) {
104        self.assertion_store
105            .write()
106            .await
107            .store(task_id, start_time, end_time, result);
108    }
109
110    async fn store_llm_response(&self, task_id: String, response: Value) {
111        self.llm_response_store
112            .write()
113            .await
114            .store(task_id, response);
115    }
116}
117
/// Decides whether tasks are ready to run by inspecting the completion,
/// conditional, and skipped status of their dependencies.
struct DependencyChecker {
    // Cloned shared run state (cheap: Arc reference bumps).
    context: ExecutionContext,
}
121
impl DependencyChecker {
    /// Wrap the shared execution context for dependency queries.
    fn new(context: ExecutionContext) -> Self {
        Self { context }
    }

    /// Tri-state readiness check for `task_id`:
    /// - `Some(true)`  — all dependencies satisfied; task may run.
    /// - `Some(false)` — task was marked skipped (a dependency was skipped,
    ///   or a conditional dependency is incomplete/failed).
    /// - `None`        — a non-conditional dependency has not completed yet.
    ///
    /// NOTE(review): the registry read guards below are deliberately
    /// dropped at the end of each block scope before `mark_skipped` is
    /// awaited — `mark_skipped` takes the registry WRITE lock, so holding a
    /// read guard across it in the same task would deadlock. Preserve this
    /// scoping if refactoring.
    async fn check_dependencies_satisfied(&self, task_id: &str) -> Option<bool> {
        debug!("Checking dependencies for task: {}", task_id);
        let dependencies = {
            let registry = self.context.task_registry.read().await;
            match registry.get_dependencies(task_id) {
                Some(deps) => deps,
                None => {
                    // Task exists but has no dependencies - ready to execute
                    debug!("Task '{}' has no dependencies, ready to execute", task_id);
                    return Some(true);
                }
            }
        };

        debug!("Task '{}' has dependencies: {:?}", task_id, dependencies);

        // Snapshot per-dependency flags up front so the registry lock is
        // released before any store reads / skip writes below.
        let dep_metadata = {
            let registry = self.context.task_registry.read().await;
            dependencies
                .iter()
                .map(|dep_id| {
                    (
                        dep_id.clone(),
                        registry.is_conditional(dep_id),
                        registry.is_skipped(dep_id),
                    )
                })
                .collect::<Vec<_>>()
        };

        for (dep_id, is_conditional, is_skipped) in dep_metadata {
            debug!(
                "Checking dependency '{}' for task '{}': conditional={}, skipped={}",
                dep_id, task_id, is_conditional, is_skipped
            );
            // A skipped dependency cascades: this task is skipped too.
            if is_skipped {
                self.mark_skipped(task_id).await;
                return Some(false);
            }

            let completed = self.check_task_completed(&dep_id).await;
            if !completed {
                // An incomplete conditional dependency means its gate can
                // never open for this stage — skip; otherwise just "not yet".
                if is_conditional {
                    self.mark_skipped(task_id).await;
                    return Some(false);
                }
                return None;
            }

            // Conditional dependency that completed but did not pass gates
            // this task off. (`?` propagates a missing result as None.)
            if is_conditional && !self.check_assertion_passed(&dep_id).await? {
                self.mark_skipped(task_id).await;
                return Some(false);
            }
        }

        Some(true)
    }

    /// A task counts as completed once its result appears in the store
    /// matching its type (trace assertions share the assertion store).
    /// Unknown ids are treated as not completed.
    async fn check_task_completed(&self, task_id: &str) -> bool {
        let registry = self.context.task_registry.read().await;
        match registry.get_type(task_id) {
            Some(TaskType::Assertion) => self
                .context
                .assertion_store
                .read()
                .await
                .retrieve(task_id)
                .is_some(),
            Some(TaskType::LLMJudge) => self
                .context
                .llm_response_store
                .read()
                .await
                .retrieve(task_id)
                .is_some(),
            Some(TaskType::TraceAssertion) => self
                .context
                .assertion_store
                .read()
                .await
                .retrieve(task_id)
                .is_some(),
            None => false,
        }
    }

    /// Whether the stored assertion result for `task_id` passed;
    /// `None` if no result has been stored yet.
    async fn check_assertion_passed(&self, task_id: &str) -> Option<bool> {
        self.context
            .assertion_store
            .read()
            .await
            .retrieve(task_id)
            .map(|res| res.2.passed)
    }

    /// Flag `task_id` as skipped in the registry (write lock).
    async fn mark_skipped(&self, task_id: &str) {
        self.context
            .task_registry
            .write()
            .await
            .mark_skipped(task_id.to_string());
    }

    /// Return the subset of `task_ids` that are ready to execute now.
    /// Tasks returning `Some(false)` (skipped) or `None` (waiting) are
    /// silently excluded.
    async fn filter_executable_tasks<'a>(&self, task_ids: &'a [String]) -> Vec<&'a str> {
        debug!("Filtering executable tasks from: {:?}", task_ids);
        let mut executable = Vec::with_capacity(task_ids.len());

        for task_id in task_ids {
            if let Some(true) = self.check_dependencies_satisfied(task_id).await {
                executable.push(task_id.as_str());
            }
        }

        executable
    }
}
243
/// Runs the tasks of one execution-plan stage: plain assertions, LLM
/// judges, and trace assertions.
struct TaskExecutor {
    // Shared run state (stores, registry, base context).
    context: ExecutionContext,
    // The evaluation profile defining all tasks and the LLM workflow.
    profile: Arc<GenAIEvalProfile>,
    // Prebuilt view over the record's trace spans for trace assertions.
    context_builder: TraceContextBuilder,
}
249
250impl TaskExecutor {
251    fn new(
252        context: ExecutionContext,
253        profile: Arc<GenAIEvalProfile>,
254        spans: Arc<Vec<TraceSpan>>,
255    ) -> Self {
256        debug!("Creating TaskExecutor");
257        let context_builder = TraceContextBuilder::new(spans);
258        Self {
259            context,
260            profile,
261            context_builder,
262        }
263    }
264
265    #[instrument(skip_all)]
266    async fn execute_level(&self, task_ids: &[String]) -> Result<(), EvaluationError> {
267        let checker = DependencyChecker::new(self.context.clone());
268        let executable_tasks = checker.filter_executable_tasks(task_ids).await;
269
270        debug!("Executable tasks for level: {:?}", executable_tasks);
271
272        if executable_tasks.is_empty() {
273            return Ok(());
274        }
275
276        let (assertions, judges, traces_assertions) = self.partition_tasks(executable_tasks).await;
277
278        debug!(
279            "Executing level with {} assertions, {} LLM judges, and {} trace assertions",
280            assertions.len(),
281            judges.len(),
282            traces_assertions.len()
283        );
284
285        let _result = tokio::try_join!(
286            self.execute_assertions(&assertions),
287            self.execute_llm_judges(&judges),
288            self.execute_trace_assertions(&traces_assertions)
289        )?;
290
291        Ok(())
292    }
293
294    async fn partition_tasks<'a>(
295        &self,
296        task_ids: Vec<&'a str>,
297    ) -> (Vec<&'a str>, Vec<&'a str>, Vec<&'a str>) {
298        let registry = self.context.task_registry.read().await;
299        let mut assertions = Vec::new();
300        let mut traces_assertions = Vec::new();
301        let mut judges = Vec::new();
302
303        for id in task_ids {
304            match registry.get_type(id) {
305                Some(TaskType::Assertion) => assertions.push(id),
306                Some(TaskType::LLMJudge) => judges.push(id),
307                Some(TaskType::TraceAssertion) => traces_assertions.push(id),
308                None => continue,
309            }
310        }
311
312        (assertions, judges, traces_assertions)
313    }
314
315    async fn execute_assertions(&self, task_ids: &[&str]) -> Result<(), EvaluationError> {
316        debug!("Executing assertion tasks: {:?}", task_ids);
317        if task_ids.is_empty() {
318            return Ok(());
319        }
320
321        let mut join_set = JoinSet::new();
322
323        for &task_id in task_ids {
324            let task_id = task_id.to_string();
325            let context = self.context.clone();
326            let profile = self.profile.clone();
327
328            join_set.spawn(async move {
329                Self::execute_assertion_task(&task_id, &context, &profile).await
330            });
331        }
332
333        while let Some(result) = join_set.join_next().await {
334            result.map_err(|e| {
335                EvaluationError::GenAIEvaluatorError(format!("Task join error: {}", e))
336            })??;
337        }
338
339        Ok(())
340    }
341
342    async fn execute_trace_assertions(&self, task_ids: &[&str]) -> Result<(), EvaluationError> {
343        debug!("Executing trace assertion tasks: {:?}", task_ids);
344        if task_ids.is_empty() {
345            return Ok(());
346        }
347        let tasks: Vec<TraceAssertionTask> = task_ids
348            .iter()
349            .filter_map(|&task_id| self.profile.get_trace_assertion_by_id(task_id))
350            .cloned()
351            .collect();
352
353        debug!("Executing {} trace assertion tasks", tasks.len());
354
355        let results = execute_trace_assertions(&self.context_builder, &tasks).inspect_err(|e| {
356            error!("Failed to execute trace assertions: {:?}", e);
357        })?;
358
359        for (task_id, result) in results.results {
360            let start_time = Utc::now(); // In a real implementation, track actual start times
361            let end_time = Utc::now();
362
363            self.context
364                .store_assertion(task_id, start_time, end_time, result)
365                .await;
366        }
367
368        Ok(())
369    }
370
371    async fn execute_llm_judges(&self, task_ids: &[&str]) -> Result<(), EvaluationError> {
372        debug!("Executing LLM judge tasks: {:?}", task_ids);
373        if task_ids.is_empty() {
374            return Ok(());
375        }
376
377        let mut join_set = JoinSet::new();
378
379        for &task_id in task_ids {
380            let task_id = task_id.to_string();
381            let context = self.context.clone();
382            let profile = self.profile.clone();
383
384            join_set.spawn(async move {
385                let result = Self::execute_llm_judge_task(&task_id, &context, &profile).await;
386                result
387            });
388        }
389
390        let mut results = HashMap::with_capacity(task_ids.len());
391        while let Some(result) = join_set.join_next().await {
392            let (judge_id, start_time, response) = result.map_err(|e| {
393                EvaluationError::GenAIEvaluatorError(format!("Task join error: {}", e))
394            })??;
395            results.insert(judge_id, (start_time, response));
396        }
397
398        self.process_llm_judge_results(results).await?;
399        Ok(())
400    }
401
402    #[instrument(skip_all, fields(task_id = %task_id))]
403    async fn execute_assertion_task(
404        task_id: &str,
405        context: &ExecutionContext,
406        profile: &GenAIEvalProfile,
407    ) -> Result<(), EvaluationError> {
408        let start_time = Utc::now();
409
410        let task = profile
411            .get_assertion_by_id(task_id)
412            .ok_or_else(|| EvaluationError::TaskNotFound(task_id.to_string()))?;
413
414        let scoped_context = context.build_scoped_context(&task.depends_on).await;
415        let result = task.execute(&scoped_context)?;
416
417        let end_time = Utc::now();
418        context
419            .store_assertion(task_id.to_string(), start_time, end_time, result)
420            .await;
421        Ok(())
422    }
423
424    #[instrument(skip_all, fields(task_id = %task_id))]
425    async fn execute_llm_judge_task(
426        task_id: &str,
427        context: &ExecutionContext,
428        profile: &GenAIEvalProfile,
429    ) -> Result<(String, DateTime<Utc>, serde_json::Value), EvaluationError> {
430        debug!("Starting LLM judge task: {}", task_id);
431        let start_time = Utc::now();
432        let judge = profile
433            .get_llm_judge_by_id(task_id)
434            .ok_or_else(|| EvaluationError::TaskNotFound(task_id.to_string()))?;
435
436        debug!("Building scoped context for: {}", task_id);
437        let scoped_context = context.build_scoped_context(&judge.depends_on).await;
438
439        let workflow = profile.workflow.as_ref().ok_or_else(|| {
440            EvaluationError::GenAIEvaluatorError("No workflow defined".to_string())
441        })?;
442
443        debug!("Executing workflow task: {}", task_id);
444
445        // This is where the actual LLM call happens - ensure it's awaited
446        let response = workflow
447            .execute_task(task_id, &scoped_context)
448            .await
449            .inspect_err(|e| error!("LLM task {} failed: {:?}", task_id, e))?;
450
451        debug!("Successfully completed LLM judge task: {}", task_id);
452        Ok((task_id.to_string(), start_time, response))
453    }
454
455    async fn process_llm_judge_results(
456        &self,
457        results: HashMap<String, (DateTime<Utc>, Value)>,
458    ) -> Result<(), EvaluationError> {
459        for (task_id, (start_time, response)) in results {
460            if let Some(task) = self.profile.get_llm_judge_by_id(&task_id) {
461                let assertion_result = task.execute(&response)?;
462
463                self.context
464                    .store_llm_response(task_id.clone(), response)
465                    .await;
466
467                self.context
468                    .store_assertion(task_id, start_time, Utc::now(), assertion_result)
469                    .await;
470            }
471        }
472        Ok(())
473    }
474}
475
/// Collects stored task results into the final `GenAIEvalSet` after all
/// stages have executed.
struct ResultCollector {
    // Shared run state; only the assertion store and stage map are read.
    context: ExecutionContext,
}
479
impl ResultCollector {
    /// Wrap the shared execution context for result collection.
    fn new(context: ExecutionContext) -> Self {
        Self { context }
    }

    /// Build the final eval set from stored results.
    ///
    /// Walks the profile's assertion, judge, and trace-assertion tasks,
    /// emitting one `GenAIEvalTaskResult` per task that has a stored
    /// result. Conditional tasks (`condition == true`) are recorded but
    /// excluded from the pass/fail tallies and pass rate. Tasks whose id is
    /// missing from the stage map get stage -1.
    ///
    /// NOTE(review): the judge branch reports `expected` from the task's
    /// `expected_value` while the other two branches use the stored
    /// `result.expected` — confirm this asymmetry is intentional.
    async fn build_eval_set(
        &self,
        record: &GenAIEvalRecord,
        profile: &GenAIEvalProfile,
        duration_ms: i64,
        execution_plan: ExecutionPlan,
    ) -> GenAIEvalSet {
        let mut passed_count = 0;
        let mut failed_count = 0;
        let mut records = Vec::new();

        let assert_store = self.context.assertion_store.read().await;

        // Plain assertions: stored directly in the assertion store.
        for assertion in &profile.tasks.assertion {
            if let Some((start_time, end_time, result)) = assert_store.retrieve(&assertion.id) {
                // Conditional gates don't count toward pass/fail totals.
                if !assertion.condition {
                    if result.passed {
                        passed_count += 1;
                    } else {
                        failed_count += 1;
                    }
                }

                let stage = *self.context.task_stages.get(&assertion.id).unwrap_or(&-1);

                records.push(scouter_types::GenAIEvalTaskResult {
                    created_at: chrono::Utc::now(),
                    start_time,
                    end_time,
                    record_uid: record.uid.clone(),
                    entity_id: record.entity_id,
                    task_id: assertion.id.clone(),
                    task_type: assertion.task_type.clone(),
                    passed: result.passed,
                    value: result.to_metric_value(),
                    assertion: Assertion::FieldPath(assertion.context_path.clone()),
                    expected: result.expected.clone(),
                    actual: result.actual.clone(),
                    message: result.message.clone(),
                    operator: assertion.operator.clone(),
                    entity_uid: String::new(),
                    condition: assertion.condition,
                    stage,
                });
            }
        }

        // LLM judges: their evaluated results also live in the assertion store.
        for judge in &profile.tasks.judge {
            if let Some((start_time, end_time, result)) = assert_store.retrieve(&judge.id) {
                if !judge.condition {
                    if result.passed {
                        passed_count += 1;
                    } else {
                        failed_count += 1;
                    }
                }

                let stage = *self.context.task_stages.get(&judge.id).unwrap_or(&-1);

                records.push(scouter_types::GenAIEvalTaskResult {
                    created_at: chrono::Utc::now(),
                    start_time,
                    end_time,
                    record_uid: record.uid.clone(),
                    entity_id: record.entity_id,
                    task_id: judge.id.clone(),
                    task_type: judge.task_type.clone(),
                    passed: result.passed,
                    value: result.to_metric_value(),
                    assertion: Assertion::FieldPath(judge.context_path.clone()),
                    expected: judge.expected_value.clone(),
                    actual: result.actual.clone(),
                    message: result.message.clone(),
                    operator: judge.operator.clone(),
                    entity_uid: String::new(),
                    condition: judge.condition,
                    stage,
                });
            }
        }

        // Trace assertions: share the assertion store with plain assertions.
        for trace_assertion in &profile.tasks.trace {
            if let Some((start_time, end_time, result)) = assert_store.retrieve(&trace_assertion.id)
            {
                if !trace_assertion.condition {
                    if result.passed {
                        passed_count += 1;
                    } else {
                        failed_count += 1;
                    }
                }

                let stage = *self
                    .context
                    .task_stages
                    .get(&trace_assertion.id)
                    .unwrap_or(&-1);

                records.push(scouter_types::GenAIEvalTaskResult {
                    created_at: chrono::Utc::now(),
                    start_time,
                    end_time,
                    record_uid: record.uid.clone(),
                    entity_id: record.entity_id,
                    task_id: trace_assertion.id.clone(),
                    task_type: trace_assertion.task_type.clone(),
                    passed: result.passed,
                    value: result.to_metric_value(),
                    assertion: Assertion::TraceAssertion(trace_assertion.assertion.clone()),
                    expected: result.expected.clone(),
                    actual: result.actual.clone(),
                    message: result.message.clone(),
                    operator: trace_assertion.operator.clone(),
                    entity_uid: String::new(),
                    condition: trace_assertion.condition,
                    stage,
                });
            }
        }

        // Workflow-level summary; pass rate is guarded against divide-by-zero
        // when every task was conditional or skipped.
        let workflow_record = scouter_types::GenAIEvalWorkflowResult {
            created_at: chrono::Utc::now(),
            id: record.id,
            entity_id: record.entity_id,
            record_uid: record.uid.clone(),
            total_tasks: passed_count + failed_count,
            passed_tasks: passed_count,
            failed_tasks: failed_count,
            pass_rate: if passed_count + failed_count == 0 {
                0.0
            } else {
                passed_count as f64 / (passed_count + failed_count) as f64
            },
            duration_ms,
            entity_uid: String::new(),
            execution_plan,
        };

        GenAIEvalSet::new(records, workflow_record)
    }
}
626
/// Stateless entry point that orchestrates a full evaluation run for one
/// `GenAIEvalRecord` against a `GenAIEvalProfile`.
pub struct GenAIEvaluator;
628
629impl GenAIEvaluator {
630    #[instrument(skip_all, fields(record_uid = %record.uid))]
631    pub async fn process_event_record(
632        record: &GenAIEvalRecord,
633        profile: Arc<GenAIEvalProfile>,
634        spans: Arc<Vec<TraceSpan>>,
635    ) -> Result<GenAIEvalSet, EvaluationError> {
636        let begin = chrono::Utc::now();
637
638        let mut registry = TaskRegistry::new();
639        Self::register_tasks(&mut registry, &profile);
640
641        let execution_plan = profile.get_execution_plan()?;
642
643        let context = ExecutionContext::new(record.context.clone(), registry, &execution_plan);
644        let executor = TaskExecutor::new(context.clone(), profile.clone(), spans);
645
646        debug!(
647            "Starting evaluation for record: {} with {} stages",
648            record.uid,
649            execution_plan.stages.len()
650        );
651
652        for (stage_idx, stage_tasks) in execution_plan.stages.iter().enumerate() {
653            debug!(
654                "Executing stage {} with {} tasks",
655                stage_idx,
656                stage_tasks.len()
657            );
658            executor
659                .execute_level(stage_tasks)
660                .await
661                .inspect_err(|e| error!("Failed to execute stage {}: {:?}", stage_idx, e))?;
662        }
663
664        let end = chrono::Utc::now();
665        let duration_ms = (end - begin).num_milliseconds();
666
667        let collector = ResultCollector::new(context);
668        let eval_set = collector
669            .build_eval_set(record, &profile, duration_ms, execution_plan)
670            .await;
671
672        Ok(eval_set)
673    }
674
675    fn register_tasks(registry: &mut TaskRegistry, profile: &GenAIEvalProfile) {
676        for task in &profile.tasks.assertion {
677            registry.register(task.id.clone(), TaskType::Assertion, task.condition);
678            if !task.depends_on.is_empty() {
679                registry.register_dependencies(task.id.clone(), task.depends_on.clone());
680            }
681        }
682
683        for task in &profile.tasks.judge {
684            registry.register(task.id.clone(), TaskType::LLMJudge, task.condition);
685            if !task.depends_on.is_empty() {
686                registry.register_dependencies(task.id.clone(), task.depends_on.clone());
687            }
688        }
689
690        for task in &profile.tasks.trace {
691            registry.register(task.id.clone(), TaskType::TraceAssertion, task.condition);
692            if !task.depends_on.is_empty() {
693                registry.register_dependencies(task.id.clone(), task.depends_on.clone());
694            }
695        }
696    }
697}
698
699#[cfg(test)]
700mod tests {
701
702    use chrono::Utc;
703    use potato_head::mock::{create_score_prompt, LLMTestServer};
704    use scouter_mocks::{
705        create_multi_service_trace, create_nested_trace, create_sequence_pattern_trace,
706        create_simple_trace, create_trace_with_attributes, create_trace_with_errors, init_tracing,
707    };
708    use scouter_types::genai::{
709        AggregationType, SpanFilter, SpanStatus, TraceAssertion, TraceAssertionTask,
710    };
711    use scouter_types::genai::{
712        AssertionTask, ComparisonOperator, GenAIAlertConfig, GenAIEvalConfig, GenAIEvalProfile,
713        LLMJudgeTask,
714    };
715    use scouter_types::genai::{EvaluationTaskType, EvaluationTasks};
716    use scouter_types::GenAIEvalRecord;
717    use serde_json::Value;
718    use std::sync::Arc;
719
720    use crate::evaluate::GenAIEvaluator;
721
    /// Fixture: a profile mixing one direct assertion, one LLM judge
    /// ("query_relevance"), and two assertions that depend on the judge's
    /// response fields (score / reason) — exercising cross-stage deps.
    async fn create_assert_judge_profile() -> GenAIEvalProfile {
        let prompt = create_score_prompt(Some(vec!["input".to_string()]));

        // Stage-1 assertion with no dependencies.
        let assertion_level_1 = AssertionTask {
            id: "input_check".to_string(),
            context_path: Some("input.foo".to_string()),
            operator: ComparisonOperator::Equals,
            expected_value: Value::String("bar".to_string()),
            description: Some("Check if input.foo is bar".to_string()),
            task_type: EvaluationTaskType::Assertion,
            depends_on: vec![],
            result: None,
            condition: false,
            item_context_path: None,
        };

        let judge_task_level_1 = LLMJudgeTask::new_rs(
            "query_relevance",
            prompt.clone(),
            Value::Number(1.into()),
            Some("score".to_string()),
            ComparisonOperator::GreaterThanOrEqual,
            None,
            None,
            None,
            None,
        );

        // Depends on the judge: checks the numeric "score" field.
        let assert_query_score = AssertionTask {
            id: "assert_score".to_string(),
            context_path: Some("query_relevance.score".to_string()),
            operator: ComparisonOperator::IsNumeric,
            expected_value: Value::Bool(true),
            depends_on: vec!["query_relevance".to_string()],
            task_type: EvaluationTaskType::Assertion,
            description: Some("Check that score is numeric".to_string()),
            result: None,
            condition: false,
            item_context_path: None,
        };

        // Depends on the judge: checks the string "reason" field.
        let assert_query_reason = AssertionTask {
            id: "assert_reason".to_string(),
            context_path: Some("query_relevance.reason".to_string()),
            operator: ComparisonOperator::IsString,
            expected_value: Value::Bool(true),
            depends_on: vec!["query_relevance".to_string()],
            task_type: EvaluationTaskType::Assertion,
            description: Some("Check that reason is alphabetic".to_string()),
            result: None,
            condition: false,
            item_context_path: None,
        };

        let tasks = EvaluationTasks::new()
            .add_task(assertion_level_1)
            .add_task(judge_task_level_1)
            .add_task(assert_query_score)
            .add_task(assert_query_reason)
            .build();

        let alert_config = GenAIAlertConfig::default();

        let drift_config =
            GenAIEvalConfig::new("scouter", "ML", "0.1.0", 1.0, alert_config, None).unwrap();

        GenAIEvalProfile::new(drift_config, tasks).await.unwrap()
    }
790
    /// Fixture: three independent assertions over input fields (equality,
    /// numeric check, length check) — no dependencies, single stage.
    async fn create_assert_profile() -> GenAIEvalProfile {
        let assert1 = AssertionTask {
            id: "input_foo_check".to_string(),
            context_path: Some("input.foo".to_string()),
            operator: ComparisonOperator::Equals,
            expected_value: Value::String("bar".to_string()),
            description: Some("Check if input.foo is bar".to_string()),
            task_type: EvaluationTaskType::Assertion,
            depends_on: vec![],
            result: None,
            condition: false,
            item_context_path: None,
        };
        let assert2 = AssertionTask {
            id: "input_bar_check".to_string(),
            context_path: Some("input.bar".to_string()),
            operator: ComparisonOperator::IsNumeric,
            expected_value: Value::Bool(true),
            depends_on: vec![],
            task_type: EvaluationTaskType::Assertion,
            description: Some("Check that bar is numeric".to_string()),
            result: None,
            condition: false,
            item_context_path: None,
        };

        let assert3 = AssertionTask {
            id: "input_baz_check".to_string(),
            context_path: Some("input.baz".to_string()),
            operator: ComparisonOperator::HasLengthEqual,
            expected_value: Value::Number(3.into()),
            depends_on: vec![],
            task_type: EvaluationTaskType::Assertion,
            description: Some("Check that baz has length 3".to_string()),
            result: None,
            condition: false,
            item_context_path: None,
        };

        let tasks = EvaluationTasks::new()
            .add_task(assert1)
            .add_task(assert2)
            .add_task(assert3)
            .build();

        let alert_config = GenAIAlertConfig::default();

        let drift_config =
            GenAIEvalConfig::new("scouter", "ML", "0.1.0", 1.0, alert_config, None).unwrap();

        GenAIEvalProfile::new(drift_config, tasks).await.unwrap()
    }
843
    /// Fixture: a profile with a single trace assertion checking span
    /// execution order (root -> child_1 -> child_2).
    async fn create_trace_profile_simple() -> GenAIEvalProfile {
        let trace_task = TraceAssertionTask {
            id: "check_span_sequence".to_string(),
            assertion: TraceAssertion::SpanSequence {
                span_names: vec![
                    "root".to_string(),
                    "child_1".to_string(),
                    "child_2".to_string(),
                ],
            },
            operator: ComparisonOperator::Equals,
            expected_value: Value::Bool(true),
            description: Some("Verify span execution order".to_string()),
            task_type: EvaluationTaskType::TraceAssertion,
            depends_on: vec![],
            condition: false,
            result: None,
        };

        let tasks = EvaluationTasks::new().add_task(trace_task).build();

        let alert_config = GenAIAlertConfig::default();
        let drift_config =
            GenAIEvalConfig::new("scouter", "trace_test", "0.1.0", 1.0, alert_config, None)
                .unwrap();

        GenAIEvalProfile::new(drift_config, tasks).await.unwrap()
    }
872
873    async fn create_trace_profile_with_filters() -> GenAIEvalProfile {
874        let span_count_task = TraceAssertionTask {
875            id: "count_error_spans".to_string(),
876            assertion: TraceAssertion::SpanCount {
877                filter: SpanFilter::WithStatus {
878                    status: SpanStatus::Error,
879                },
880            },
881            operator: ComparisonOperator::Equals,
882            expected_value: Value::Number(1.into()),
883            description: Some("Count spans with error status".to_string()),
884            task_type: EvaluationTaskType::TraceAssertion,
885            depends_on: vec![],
886            condition: false,
887            result: None,
888        };
889
890        let span_exists_task = TraceAssertionTask {
891            id: "check_recovery_span".to_string(),
892            assertion: TraceAssertion::SpanExists {
893                filter: SpanFilter::ByName {
894                    name: "recovery".to_string(),
895                },
896            },
897            operator: ComparisonOperator::Equals,
898            expected_value: Value::Bool(true),
899            description: Some("Verify recovery span exists".to_string()),
900            task_type: EvaluationTaskType::TraceAssertion,
901            depends_on: vec![],
902            condition: false,
903            result: None,
904        };
905
906        let tasks = EvaluationTasks::new()
907            .add_task(span_count_task)
908            .add_task(span_exists_task)
909            .build();
910
911        let alert_config = GenAIAlertConfig::default();
912        let drift_config =
913            GenAIEvalConfig::new("scouter", "trace_test", "0.1.0", 1.0, alert_config, None)
914                .unwrap();
915
916        GenAIEvalProfile::new(drift_config, tasks).await.unwrap()
917    }
918
919    async fn create_trace_profile_with_attributes() -> GenAIEvalProfile {
920        let attribute_task = TraceAssertionTask {
921            id: "check_model_name".to_string(),
922            assertion: TraceAssertion::SpanAttribute {
923                filter: SpanFilter::ByName {
924                    name: "api_call".to_string(),
925                },
926                attribute_key: "model".to_string(),
927            },
928            operator: ComparisonOperator::Equals,
929            expected_value: Value::String("gpt-4".to_string()),
930            description: Some("Verify model attribute".to_string()),
931            task_type: EvaluationTaskType::TraceAssertion,
932            depends_on: vec![],
933            condition: false,
934            result: None,
935        };
936
937        let aggregation_task = TraceAssertionTask {
938            id: "sum_token_output".to_string(),
939            assertion: TraceAssertion::SpanAggregation {
940                filter: SpanFilter::ByName {
941                    name: "api_call".to_string(),
942                },
943                attribute_key: "tokens.output".to_string(),
944                aggregation: AggregationType::Sum,
945            },
946            operator: ComparisonOperator::Equals,
947            expected_value: Value::Number(300.into()),
948            description: Some("Sum output tokens".to_string()),
949            task_type: EvaluationTaskType::TraceAssertion,
950            depends_on: vec![],
951            condition: false,
952            result: None,
953        };
954
955        let tasks = EvaluationTasks::new()
956            .add_task(attribute_task)
957            .add_task(aggregation_task)
958            .build();
959
960        let alert_config = GenAIAlertConfig::default();
961        let drift_config =
962            GenAIEvalConfig::new("scouter", "trace_test", "0.1.0", 1.0, alert_config, None)
963                .unwrap();
964
965        GenAIEvalProfile::new(drift_config, tasks).await.unwrap()
966    }
967
968    async fn create_trace_profile_complex() -> GenAIEvalProfile {
969        let sequence_count_task = TraceAssertionTask {
970            id: "count_tool_agent_sequence".to_string(),
971            assertion: TraceAssertion::SpanCount {
972                filter: SpanFilter::Sequence {
973                    names: vec!["call_tool".to_string(), "run_agent".to_string()],
974                },
975            },
976            operator: ComparisonOperator::Equals,
977            expected_value: Value::Number(2.into()),
978            description: Some("Count tool->agent sequences".to_string()),
979            task_type: EvaluationTaskType::TraceAssertion,
980            depends_on: vec![],
981            condition: false,
982            result: None,
983        };
984
985        let trace_duration_task = TraceAssertionTask {
986            id: "check_trace_duration".to_string(),
987            assertion: TraceAssertion::TraceDuration {},
988            operator: ComparisonOperator::LessThanOrEqual,
989            expected_value: Value::Number(1000.into()),
990            description: Some("Verify trace completes within 1s".to_string()),
991            task_type: EvaluationTaskType::TraceAssertion,
992            depends_on: vec![],
993            condition: false,
994            result: None,
995        };
996
997        let service_count_task = TraceAssertionTask {
998            id: "check_service_count".to_string(),
999            assertion: TraceAssertion::TraceServiceCount {},
1000            operator: ComparisonOperator::Equals,
1001            expected_value: Value::Number(1.into()),
1002            description: Some("Verify single service".to_string()),
1003            task_type: EvaluationTaskType::TraceAssertion,
1004            depends_on: vec![],
1005            condition: false,
1006            result: None,
1007        };
1008
1009        let tasks = EvaluationTasks::new()
1010            .add_task(sequence_count_task)
1011            .add_task(trace_duration_task)
1012            .add_task(service_count_task)
1013            .build();
1014
1015        let alert_config = GenAIAlertConfig::default();
1016        let drift_config =
1017            GenAIEvalConfig::new("scouter", "trace_test", "0.1.0", 1.0, alert_config, None)
1018                .unwrap();
1019
1020        GenAIEvalProfile::new(drift_config, tasks).await.unwrap()
1021    }
1022
1023    async fn create_trace_profile_with_dependencies() -> GenAIEvalProfile {
1024        let error_check = TraceAssertionTask {
1025            id: "check_has_errors".to_string(),
1026            assertion: TraceAssertion::TraceErrorCount {},
1027            operator: ComparisonOperator::GreaterThan,
1028            expected_value: Value::Number(0.into()),
1029            description: Some("Check if trace has errors".to_string()),
1030            task_type: EvaluationTaskType::TraceAssertion,
1031            depends_on: vec![],
1032            condition: true,
1033            result: None,
1034        };
1035
1036        let recovery_check = TraceAssertionTask {
1037            id: "check_recovery_exists".to_string(),
1038            assertion: TraceAssertion::SpanExists {
1039                filter: SpanFilter::ByName {
1040                    name: "recovery".to_string(),
1041                },
1042            },
1043            operator: ComparisonOperator::Equals,
1044            expected_value: Value::Bool(true),
1045            description: Some("Verify recovery span exists when errors present".to_string()),
1046            task_type: EvaluationTaskType::TraceAssertion,
1047            depends_on: vec!["check_has_errors".to_string()],
1048            condition: false,
1049            result: None,
1050        };
1051
1052        let tasks = EvaluationTasks::new()
1053            .add_task(error_check)
1054            .add_task(recovery_check)
1055            .build();
1056
1057        let alert_config = GenAIAlertConfig::default();
1058        let drift_config =
1059            GenAIEvalConfig::new("scouter", "trace_test", "0.1.0", 1.0, alert_config, None)
1060                .unwrap();
1061
1062        GenAIEvalProfile::new(drift_config, tasks).await.unwrap()
1063    }
1064
1065    #[test]
1066    fn test_evaluator_assert_judge_all_pass() {
1067        let mut mock = LLMTestServer::new();
1068        mock.start_server().unwrap();
1069        let runtime = tokio::runtime::Runtime::new().unwrap();
1070        let profile = runtime.block_on(async { create_assert_judge_profile().await });
1071
1072        assert!(profile.has_llm_tasks());
1073        assert!(profile.has_assertions());
1074
1075        let context = serde_json::json!({
1076        "input": {
1077            "foo": "bar" }
1078        });
1079
1080        let record = GenAIEvalRecord::new_rs(
1081            context,
1082            Utc::now(),
1083            "UID123".to_string(),
1084            "ENTITY123".to_string(),
1085            None,
1086            None,
1087        );
1088
1089        let result_set = runtime.block_on(async {
1090            GenAIEvaluator::process_event_record(&record, Arc::new(profile), Arc::new(vec![])).await
1091        });
1092
1093        let eval_set = result_set.unwrap();
1094        assert!(eval_set.passed_tasks() == 4);
1095        assert!(eval_set.failed_tasks() == 0);
1096
1097        mock.stop_server().unwrap();
1098    }
1099
1100    #[test]
1101    fn test_evaluator_assert_one_fail() {
1102        let mut mock = LLMTestServer::new();
1103        mock.start_server().unwrap();
1104        let runtime = tokio::runtime::Runtime::new().unwrap();
1105        let profile = runtime.block_on(async { create_assert_profile().await });
1106
1107        assert!(!profile.has_llm_tasks());
1108        assert!(profile.has_assertions());
1109
1110        // we want task "input_bar_check" to fail (is_numeric on non-numeric)
1111        let context = serde_json::json!({
1112            "input": {
1113                "foo": "bar",
1114                "bar": "not_a_number",
1115                "baz": [1, 2, 3]}
1116        });
1117
1118        let record = GenAIEvalRecord::new_rs(
1119            context,
1120            Utc::now(),
1121            "UID123".to_string(),
1122            "ENTITY123".to_string(),
1123            None,
1124            None,
1125        );
1126
1127        let result_set = runtime.block_on(async {
1128            GenAIEvaluator::process_event_record(&record, Arc::new(profile), Arc::new(vec![])).await
1129        });
1130
1131        let eval_set = result_set.unwrap();
1132        assert!(eval_set.passed_tasks() == 2);
1133        assert!(eval_set.failed_tasks() == 1);
1134
1135        mock.stop_server().unwrap();
1136    }
1137
1138    #[test]
1139    fn test_evaluator_trace_simple_sequence() {
1140        init_tracing();
1141        let runtime = tokio::runtime::Runtime::new().unwrap();
1142        let profile = runtime.block_on(create_trace_profile_simple());
1143        let spans = Arc::new(create_simple_trace());
1144
1145        let context = serde_json::json!({});
1146        let record = GenAIEvalRecord::new_rs(
1147            context,
1148            Utc::now(),
1149            "TRACE_UID_001".to_string(),
1150            "ENTITY_001".to_string(),
1151            None,
1152            None,
1153        );
1154
1155        let result = runtime.block_on(GenAIEvaluator::process_event_record(
1156            &record,
1157            Arc::new(profile),
1158            spans,
1159        ));
1160
1161        let eval_set = result.unwrap();
1162        assert_eq!(eval_set.passed_tasks(), 1);
1163        assert_eq!(eval_set.failed_tasks(), 0);
1164    }
1165
1166    #[test]
1167    fn test_evaluator_trace_error_detection() {
1168        let runtime = tokio::runtime::Runtime::new().unwrap();
1169        let profile = runtime.block_on(create_trace_profile_with_filters());
1170        let spans = Arc::new(create_trace_with_errors());
1171
1172        let context = serde_json::json!({});
1173        let record = GenAIEvalRecord::new_rs(
1174            context,
1175            Utc::now(),
1176            "TRACE_UID_002".to_string(),
1177            "ENTITY_002".to_string(),
1178            None,
1179            None,
1180        );
1181
1182        let result = runtime.block_on(GenAIEvaluator::process_event_record(
1183            &record,
1184            Arc::new(profile),
1185            spans,
1186        ));
1187
1188        let eval_set = result.unwrap();
1189        assert_eq!(eval_set.passed_tasks(), 2);
1190        assert_eq!(eval_set.failed_tasks(), 0);
1191    }
1192
1193    #[test]
1194    fn test_evaluator_trace_attribute_extraction() {
1195        init_tracing();
1196        let runtime = tokio::runtime::Runtime::new().unwrap();
1197        let profile = runtime.block_on(create_trace_profile_with_attributes());
1198        let spans = Arc::new(create_trace_with_attributes());
1199
1200        let context = serde_json::json!({});
1201        let record = GenAIEvalRecord::new_rs(
1202            context,
1203            Utc::now(),
1204            "TRACE_UID_003".to_string(),
1205            "ENTITY_003".to_string(),
1206            None,
1207            None,
1208        );
1209
1210        let result = runtime.block_on(GenAIEvaluator::process_event_record(
1211            &record,
1212            Arc::new(profile),
1213            spans,
1214        ));
1215
1216        let eval_set = result.unwrap();
1217        assert_eq!(eval_set.passed_tasks(), 2);
1218        assert_eq!(eval_set.failed_tasks(), 0);
1219    }
1220
1221    #[test]
1222    fn test_evaluator_trace_sequence_pattern() {
1223        init_tracing();
1224        let runtime = tokio::runtime::Runtime::new().unwrap();
1225        let profile = runtime.block_on(create_trace_profile_complex());
1226        let spans = Arc::new(create_sequence_pattern_trace());
1227
1228        let context = serde_json::json!({});
1229        let record = GenAIEvalRecord::new_rs(
1230            context,
1231            Utc::now(),
1232            "TRACE_UID_004".to_string(),
1233            "ENTITY_004".to_string(),
1234            None,
1235            None,
1236        );
1237
1238        let result = runtime.block_on(GenAIEvaluator::process_event_record(
1239            &record,
1240            Arc::new(profile),
1241            spans,
1242        ));
1243
1244        let eval_set = result.unwrap();
1245        assert_eq!(eval_set.passed_tasks(), 3);
1246        assert_eq!(eval_set.failed_tasks(), 0);
1247    }
1248
1249    #[test]
1250    fn test_evaluator_trace_conditional_dependency() {
1251        let runtime = tokio::runtime::Runtime::new().unwrap();
1252        let profile = runtime.block_on(create_trace_profile_with_dependencies());
1253        let spans = Arc::new(create_trace_with_errors());
1254
1255        let context = serde_json::json!({});
1256        let record = GenAIEvalRecord::new_rs(
1257            context,
1258            Utc::now(),
1259            "TRACE_UID_005".to_string(),
1260            "ENTITY_005".to_string(),
1261            None,
1262            None,
1263        );
1264
1265        let result = runtime.block_on(GenAIEvaluator::process_event_record(
1266            &record,
1267            Arc::new(profile),
1268            spans,
1269        ));
1270
1271        let eval_set = result.unwrap();
1272        assert_eq!(eval_set.passed_tasks(), 1); // first task is conditional and is excluded
1273        assert_eq!(eval_set.failed_tasks(), 0);
1274    }
1275
1276    #[test]
1277    fn test_evaluator_trace_multi_service() {
1278        let runtime = tokio::runtime::Runtime::new().unwrap();
1279
1280        let task = TraceAssertionTask {
1281            id: "check_service_count".to_string(),
1282            assertion: TraceAssertion::TraceServiceCount {},
1283            operator: ComparisonOperator::Equals,
1284            expected_value: Value::Number(3.into()),
1285            description: Some("Verify three services".to_string()),
1286            task_type: EvaluationTaskType::TraceAssertion,
1287            depends_on: vec![],
1288            condition: false,
1289            result: None,
1290        };
1291
1292        let tasks = EvaluationTasks::new().add_task(task).build();
1293        let alert_config = GenAIAlertConfig::default();
1294        let drift_config =
1295            GenAIEvalConfig::new("scouter", "trace_test", "0.1.0", 1.0, alert_config, None)
1296                .unwrap();
1297
1298        let profile = runtime
1299            .block_on(GenAIEvalProfile::new(drift_config, tasks))
1300            .unwrap();
1301        let spans = Arc::new(create_multi_service_trace());
1302
1303        let context = serde_json::json!({});
1304        let record = GenAIEvalRecord::new_rs(
1305            context,
1306            Utc::now(),
1307            "TRACE_UID_006".to_string(),
1308            "ENTITY_006".to_string(),
1309            None,
1310            None,
1311        );
1312
1313        let result = runtime.block_on(GenAIEvaluator::process_event_record(
1314            &record,
1315            Arc::new(profile),
1316            spans,
1317        ));
1318
1319        let eval_set = result.unwrap();
1320        assert_eq!(eval_set.passed_tasks(), 1);
1321        assert_eq!(eval_set.failed_tasks(), 0);
1322    }
1323
1324    #[test]
1325    fn test_evaluator_trace_assertion_failure() {
1326        let runtime = tokio::runtime::Runtime::new().unwrap();
1327
1328        let task = TraceAssertionTask {
1329            id: "check_wrong_sequence".to_string(),
1330            assertion: TraceAssertion::SpanSequence {
1331                span_names: vec![
1332                    "root".to_string(),
1333                    "wrong_child".to_string(),
1334                    "child_2".to_string(),
1335                ],
1336            },
1337            operator: ComparisonOperator::Equals,
1338            expected_value: Value::Bool(true),
1339            description: Some("Verify incorrect span order".to_string()),
1340            task_type: EvaluationTaskType::TraceAssertion,
1341            depends_on: vec![],
1342            condition: false,
1343            result: None,
1344        };
1345
1346        let tasks = EvaluationTasks::new().add_task(task).build();
1347        let alert_config = GenAIAlertConfig::default();
1348        let drift_config =
1349            GenAIEvalConfig::new("scouter", "trace_test", "0.1.0", 1.0, alert_config, None)
1350                .unwrap();
1351
1352        let profile = runtime
1353            .block_on(GenAIEvalProfile::new(drift_config, tasks))
1354            .unwrap();
1355        let spans = Arc::new(create_simple_trace());
1356
1357        let context = serde_json::json!({});
1358        let record = GenAIEvalRecord::new_rs(
1359            context,
1360            Utc::now(),
1361            "TRACE_UID_007".to_string(),
1362            "ENTITY_007".to_string(),
1363            None,
1364            None,
1365        );
1366
1367        let result = runtime.block_on(GenAIEvaluator::process_event_record(
1368            &record,
1369            Arc::new(profile),
1370            spans,
1371        ));
1372
1373        let eval_set = result.unwrap();
1374        assert_eq!(eval_set.passed_tasks(), 0);
1375        assert_eq!(eval_set.failed_tasks(), 1);
1376    }
1377
1378    #[test]
1379    fn test_evaluator_trace_mixed_assertions() {
1380        init_tracing();
1381        let runtime = tokio::runtime::Runtime::new().unwrap();
1382
1383        let trace_task = TraceAssertionTask {
1384            id: "check_max_depth".to_string(),
1385            assertion: TraceAssertion::TraceMaxDepth {},
1386            operator: ComparisonOperator::Equals,
1387            expected_value: Value::Number(2.into()),
1388            description: Some("Verify max depth".to_string()),
1389            task_type: EvaluationTaskType::TraceAssertion,
1390            depends_on: vec![],
1391            condition: false,
1392            result: None,
1393        };
1394
1395        let regular_assertion = AssertionTask {
1396            id: "check_context".to_string(),
1397            context_path: Some("metadata.version".to_string()),
1398            operator: ComparisonOperator::Equals,
1399            expected_value: Value::String("1.0.0".to_string()),
1400            description: Some("Verify version".to_string()),
1401            task_type: EvaluationTaskType::Assertion,
1402            depends_on: vec![],
1403            result: None,
1404            condition: false,
1405            item_context_path: None,
1406        };
1407
1408        let tasks = EvaluationTasks::new()
1409            .add_task(trace_task)
1410            .add_task(regular_assertion)
1411            .build();
1412
1413        let alert_config = GenAIAlertConfig::default();
1414        let drift_config =
1415            GenAIEvalConfig::new("scouter", "trace_test", "0.1.0", 1.0, alert_config, None)
1416                .unwrap();
1417
1418        let profile = runtime
1419            .block_on(GenAIEvalProfile::new(drift_config, tasks))
1420            .unwrap();
1421        let spans = Arc::new(create_nested_trace());
1422
1423        let context = serde_json::json!({
1424            "metadata": {
1425                "version": "1.0.0"
1426            }
1427        });
1428
1429        let record = GenAIEvalRecord::new_rs(
1430            context,
1431            Utc::now(),
1432            "TRACE_UID_008".to_string(),
1433            "ENTITY_008".to_string(),
1434            None,
1435            None,
1436        );
1437
1438        let result = runtime.block_on(GenAIEvaluator::process_event_record(
1439            &record,
1440            Arc::new(profile),
1441            spans,
1442        ));
1443
1444        let eval_set = result.unwrap();
1445        assert_eq!(eval_set.passed_tasks(), 2);
1446        assert_eq!(eval_set.failed_tasks(), 0);
1447    }
1448
1449    #[test]
1450    fn test_evaluator_trace_duration_filter() {
1451        init_tracing();
1452        let runtime = tokio::runtime::Runtime::new().unwrap();
1453
1454        let task = TraceAssertionTask {
1455            id: "check_slow_spans".to_string(),
1456            assertion: TraceAssertion::SpanCount {
1457                filter: SpanFilter::WithDuration {
1458                    min_ms: Some(100.0),
1459                    max_ms: None,
1460                },
1461            },
1462            operator: ComparisonOperator::GreaterThanOrEqual,
1463            expected_value: Value::Number(2.into()),
1464            description: Some("Count spans over 100ms".to_string()),
1465            task_type: EvaluationTaskType::TraceAssertion,
1466            depends_on: vec![],
1467            condition: false,
1468            result: None,
1469        };
1470
1471        let tasks = EvaluationTasks::new().add_task(task).build();
1472        let alert_config = GenAIAlertConfig::default();
1473        let drift_config =
1474            GenAIEvalConfig::new("scouter", "trace_test", "0.1.0", 1.0, alert_config, None)
1475                .unwrap();
1476
1477        let profile = runtime
1478            .block_on(GenAIEvalProfile::new(drift_config, tasks))
1479            .unwrap();
1480        let spans = Arc::new(create_nested_trace());
1481
1482        let context = serde_json::json!({});
1483        let record = GenAIEvalRecord::new_rs(
1484            context,
1485            Utc::now(),
1486            "TRACE_UID_009".to_string(),
1487            "ENTITY_009".to_string(),
1488            None,
1489            None,
1490        );
1491
1492        let result = runtime.block_on(GenAIEvaluator::process_event_record(
1493            &record,
1494            Arc::new(profile),
1495            spans,
1496        ));
1497
1498        let eval_set = result.unwrap();
1499        assert_eq!(eval_set.passed_tasks(), 1);
1500        assert_eq!(eval_set.failed_tasks(), 0);
1501    }
1502}