llm/evaluator/
mod.rs

1//! Module for evaluating and comparing responses from multiple LLM providers.
2//!
3//! This module provides functionality to run the same prompt through multiple LLMs
4//! and score their responses using custom evaluation functions.
5
6mod parallel;
7
8use crate::{chat::ChatMessage, error::LLMError, LLMProvider};
9
10pub use parallel::{ParallelEvalResult, ParallelEvaluator};
11
/// Signature of a scoring callback: borrows the response text and returns an `f32` score.
///
/// The `Send + Sync + 'static` bounds are part of the alias itself, so a
/// `Box<ScoringFn>` can be shared across threads and holds no short-lived borrows.
pub type ScoringFn = dyn Fn(&str) -> f32 + Send + Sync + 'static;
14
/// Evaluator for comparing responses from multiple LLM providers.
///
/// Holds the providers to query together with the scoring functions that are
/// applied to each response. Scorers are registered via [`LLMEvaluator::scoring`].
pub struct LLMEvaluator {
    /// Collection of LLM providers to evaluate
    llms: Vec<Box<dyn LLMProvider>>,
    /// Scoring functions applied to every response; their results are summed.
    /// May be empty, in which case every response scores `0.0`.
    scorings_fns: Vec<Box<ScoringFn>>,
}
22
23impl LLMEvaluator {
24    /// Creates a new evaluator with the given LLM providers
25    ///
26    /// # Arguments
27    /// * `llms` - Vector of LLM providers to evaluate
28    pub fn new(llms: Vec<Box<dyn LLMProvider>>) -> Self {
29        Self {
30            llms,
31            scorings_fns: Vec::new(),
32        }
33    }
34
35    /// Adds a scoring function to evaluate LLM responses
36    ///
37    /// # Arguments
38    /// * `f` - Function that takes a response string and returns a score
39    pub fn scoring<F>(mut self, f: F) -> Self
40    where
41        F: Fn(&str) -> f32 + Send + Sync + 'static,
42    {
43        self.scorings_fns.push(Box::new(f));
44        self
45    }
46
47    /// Evaluates chat responses from all providers for the given messages
48    ///
49    /// # Arguments
50    /// * `messages` - Chat messages to send to each provider
51    ///
52    /// # Returns
53    /// Vector of evaluation results containing responses and scores
54    pub async fn evaluate_chat(
55        &self,
56        messages: &[ChatMessage],
57    ) -> Result<Vec<EvalResult>, LLMError> {
58        let mut results = Vec::new();
59        for llm in &self.llms {
60            let response = llm.chat(messages).await?;
61            let score = self.compute_score(&response.text().unwrap_or_default());
62            results.push(EvalResult {
63                text: response.text().unwrap_or_default(),
64                score,
65            });
66        }
67        Ok(results)
68    }
69
70    /// Computes the score for a given response
71    ///
72    /// # Arguments
73    /// * `response` - The response to score
74    ///
75    /// # Returns
76    /// The computed score
77    fn compute_score(&self, response: &str) -> f32 {
78        let mut total = 0.0;
79        for sc in &self.scorings_fns {
80            total += sc(response);
81        }
82        total
83    }
84}
85
/// Result of evaluating a single LLM response.
///
/// Derives `Debug`, `Clone` and `PartialEq` so results can be logged,
/// duplicated and compared by callers.
#[derive(Debug, Clone, PartialEq)]
pub struct EvalResult {
    /// The text response from the LLM
    pub text: String,
    /// Total score assigned to the response by the scoring functions
    /// (`0.0` when none were registered)
    pub score: f32,
}