use crate::error::EvaluationError;
use crate::types::{EvaluationConfig, LLMEvalRecord, LLMEvalResults, LLMEvalTaskResult};
use crate::util::{
    collect_evaluation_results, spawn_evaluation_tasks_with_embeddings,
    spawn_evaluation_tasks_without_embeddings,
};
use potato_head::{Agent, Provider, Task, Workflow, WorkflowError};
use pyo3::prelude::*;
use scouter_state::app_state;
use scouter_types::eval::LLMEvalMetric;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Arc;
use tokio::task::JoinSet;
use tracing::{debug, instrument};
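
/// Spawns evaluation tasks for the given records (with embeddings when the
/// config provides an embedder and embedding targets) and collects their results.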
#[instrument(skip_all)]
pub async fn async_evaluate_llm(
    workflow: Workflow,
    records: Vec<LLMEvalRecord>,
    config: &Arc<EvaluationConfig>,
) -> Result<LLMEvalResults, EvaluationError> {
    debug!("Starting LLM evaluation for {} records", records.len());

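    // Embeddings are only computed when an embedder is configured and at least
    // one embedding target is requested; otherwise fall back to the standard path.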
    let join_set: JoinSet<(String, Option<LLMEvalTaskResult>)> = match (
        config.embedder.as_ref(),
        config.embedding_targets.is_empty(),
    ) {
        (Some(embedder), false) => {
            debug!("Using embedding-enabled evaluation path");
            spawn_evaluation_tasks_with_embeddings(workflow, records, Arc::clone(embedder), config)
                .await
        }
        _ => {
            debug!("Using standard evaluation path");
            spawn_evaluation_tasks_without_embeddings(workflow, records).await
        }
    };

    let results = collect_evaluation_results(join_set).await?;

    Ok(results)
}

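/// Builds a [`Workflow`] from the given evaluation metrics, creating one
/// [`Agent`] per provider and one [`Task`] per metric.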
pub async fn workflow_from_eval_metrics(
    eval_metrics: Vec<LLMEvalMetric>,
    name: &str,
) -> Result<Workflow, EvaluationError> {
    let mut workflow = Workflow::new(name);
    let mut agents: HashMap<Provider, Agent> = HashMap::new();

    for metric in &eval_metrics {
        let provider = metric.prompt.model_settings.provider();

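        // Reuse a single agent per provider; create and register it on first use.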
        let agent = match agents.entry(provider) {
            Entry::Occupied(entry) => entry.into_mut(),
            Entry::Vacant(entry) => {
                let agent = Agent::from_model_settings(&metric.prompt.model_settings)
                    .await
                    .map_err(|e| WorkflowError::Error(format!("Failed to create agent: {}", e)))?;
                workflow.add_agent(&agent);
                entry.insert(agent)
            }
        };

        let task = Task::new(&agent.id, metric.prompt.clone(), &metric.name, None, None);
        workflow.add_task(task)?;
    }

    Ok(workflow)
}

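/// Python entry point: evaluates `records` against the given `metrics`,
/// blocking on the shared async runtime, and applies any configured
/// post-processing before returning the results.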
#[pyfunction]
#[pyo3(signature = (records, metrics, config=None))]
pub fn evaluate_llm(
    records: Vec<LLMEvalRecord>,
    metrics: Vec<LLMEvalMetric>,
    config: Option<EvaluationConfig>,
) -> Result<LLMEvalResults, EvaluationError> {
    let config = Arc::new(config.unwrap_or_default());

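    // Build the metric workflow, then run the evaluation to completion on the
    // shared runtime handle.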
    let mut results = app_state().handle().block_on(async {
        let workflow = workflow_from_eval_metrics(metrics, "LLM Evaluation").await?;
        async_evaluate_llm(workflow, records, &config).await
    })?;

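    // Skip the finalize pass entirely when no post-processing is configured.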
    if config.needs_post_processing() {
        results.finalize(&config)?;
    }

    Ok(results)
}