scouter_evaluate/llm.rs

use crate::error::EvaluationError;
use crate::types::{EvaluationConfig, LLMEvalTaskResult};
use crate::types::{LLMEvalRecord, LLMEvalResults};
use crate::util::{
    collect_evaluation_results, spawn_evaluation_tasks_with_embeddings,
    spawn_evaluation_tasks_without_embeddings,
};
use potato_head::{Agent, Provider, Task, Workflow, WorkflowError};
use pyo3::prelude::*;
use scouter_state::app_state;
use scouter_types::eval::LLMEvalMetric;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Arc;
use tokio::task::JoinSet;
use tracing::{debug, instrument};

/// Main orchestration function that decides which execution path to take.
///
/// # Arguments
/// * `workflow`: The workflow to execute.
/// * `records`: The data records to evaluate.
/// * `config`: Evaluation configuration, including an optional embedder and the list of
///   fields to embed (`embedding_targets`).
#[instrument(skip_all)]
pub async fn async_evaluate_llm(
    workflow: Workflow,
    records: Vec<LLMEvalRecord>,
    config: &Arc<EvaluationConfig>,
) -> Result<LLMEvalResults, EvaluationError> {
    debug!("Starting LLM evaluation for {} records", records.len());

    let join_set: JoinSet<(String, Option<LLMEvalTaskResult>)> = match (
        config.embedder.as_ref(),
        config.embedding_targets.is_empty(),
    ) {
        (Some(embedder), false) => {
            debug!("Using embedding-enabled evaluation path");
            spawn_evaluation_tasks_with_embeddings(workflow, records, Arc::clone(embedder), config)
                .await
        }
        _ => {
            debug!("Using standard evaluation path");

            // Spawn one evaluation task per record without embeddings; the results
            // are collected from the JoinSet below
            spawn_evaluation_tasks_without_embeddings(workflow, records).await
        }
    };

    let results = collect_evaluation_results(join_set).await?;

    Ok(results)
}
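
// Path-selection sketch (hypothetical config values; the field names match the reads in
// the match above):
//
//     let config = EvaluationConfig {
//         embedder: Some(embedder),                        // hypothetical Arc'd embedder
//         embedding_targets: vec!["response".to_string()], // at least one target field
//         ..EvaluationConfig::default()
//     };
//     // -> embedding-enabled path; any other combination (no embedder, or empty
//     //    embedding_targets) falls through to the standard path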

/// Builds a workflow from a list of `LLMEvalMetric` objects.
pub async fn workflow_from_eval_metrics(
    eval_metrics: Vec<LLMEvalMetric>,
    name: &str,
) -> Result<Workflow, EvaluationError> {
    // Build a workflow from metrics
    let mut workflow = Workflow::new(name);
    let mut agents: HashMap<Provider, Agent> = HashMap::new();
    let mut metric_names = Vec::new();

    // Create agents, deduplicating by provider: reuse an existing agent if one already
    // exists; otherwise create it and register it with the workflow.
    for metric in &eval_metrics {
        let provider = metric.prompt.model_settings.provider();

        let agent = match agents.entry(provider) {
            Entry::Occupied(entry) => entry.into_mut(),
            Entry::Vacant(entry) => {
                let agent = Agent::from_model_settings(&metric.prompt.model_settings)
                    .await
                    .map_err(|e| WorkflowError::Error(format!("Failed to create agent: {}", e)))?;
                workflow.add_agent(&agent);
                entry.insert(agent)
            }
        };

        let task = Task::new(&agent.id, metric.prompt.clone(), &metric.name, None, None);
        workflow.add_task(task)?;
        metric_names.push(metric.name.clone());
    }

    Ok(workflow)
}
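
// Usage sketch (hypothetical `LLMEvalMetric` values): two metrics whose prompts share a
// provider reuse a single Agent but each produce their own Task, named after the metric.
//
//     let workflow = workflow_from_eval_metrics(
//         vec![relevance_metric, coherence_metric],
//         "LLM Evaluation",
//     )
//     .await?;
//     // => 1 agent (shared provider), 2 tasks: "relevance" and "coherence"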

#[pyfunction]
/// Evaluates LLM responses and generates metrics.
///
/// The primary use case for `evaluate_llm` is to take a list of data samples, which often
/// contain inputs and outputs from LLM systems, and evaluate them against user-defined
/// metrics in an LLM-as-a-judge pipeline. The user is expected to provide a list of dict
/// objects and a list of `LLMEvalMetric` objects. The metrics are used to build a
/// workflow, which is then executed in an async context. All eval scores are extracted
/// and returned to the user.
///
/// # Arguments
/// * `records`: A list of data samples to evaluate.
/// * `metrics`: A list of evaluation metrics to use.
/// * `config`: Optional evaluation configuration; defaults are used when omitted.
#[pyo3(signature = (records, metrics, config=None))]
pub fn evaluate_llm(
    records: Vec<LLMEvalRecord>,
    metrics: Vec<LLMEvalMetric>,
    config: Option<EvaluationConfig>,
) -> Result<LLMEvalResults, EvaluationError> {
    let config = Arc::new(config.unwrap_or_default());

    // Execute the evaluation pipeline on the application's async runtime handle
    let mut results = app_state().handle().block_on(async {
        let workflow = workflow_from_eval_metrics(metrics, "LLM Evaluation").await?;
        async_evaluate_llm(workflow, records, &config).await
    })?;

    // Only run post-processing if needed
    // Post-processing includes calculating embedding means, similarities, clustering, and histograms
    if config.needs_post_processing() {
        results.finalize(&config)?;
    }

    Ok(results)
}
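
// End-to-end sketch (hypothetical values; in practice this entrypoint is typically
// called from Python via pyo3):
//
//     let records = load_records();                         // hypothetical Vec<LLMEvalRecord>
//     let metrics = vec![relevance_metric];                 // hypothetical Vec<LLMEvalMetric>
//     let results = evaluate_llm(records, metrics, None)?;  // None -> default EvaluationConfig
//     // Post-processing (embedding means, similarities, clustering, histograms) runs
//     // inside evaluate_llm only when the config requires it.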