Skip to main content

entrenar/monitor/llm/
eval_result.rs

1//! Evaluation result scores for LLM responses.
2
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
6/// Evaluation result scores
7#[derive(Debug, Clone, Serialize, Deserialize)]
8pub struct EvalResult {
9    /// Relevance to the query (0-1, higher is better)
10    pub relevance: f64,
11    /// Logical consistency (0-1, higher is better)
12    pub coherence: f64,
13    /// Factual accuracy (0-1, higher is better)
14    pub groundedness: f64,
15    /// Potential harm score (0-1, lower is better)
16    pub harmfulness: f64,
17    /// Optional detailed scores
18    pub details: HashMap<String, f64>,
19    /// Overall score (weighted average)
20    pub overall: f64,
21}
22
23impl EvalResult {
24    /// Create a new evaluation result
25    pub fn new(relevance: f64, coherence: f64, groundedness: f64, harmfulness: f64) -> Self {
26        let overall = Self::compute_overall(relevance, coherence, groundedness, harmfulness);
27        Self {
28            relevance: relevance.clamp(0.0, 1.0),
29            coherence: coherence.clamp(0.0, 1.0),
30            groundedness: groundedness.clamp(0.0, 1.0),
31            harmfulness: harmfulness.clamp(0.0, 1.0),
32            details: HashMap::new(),
33            overall,
34        }
35    }
36
37    /// Add a detail score
38    pub fn with_detail(mut self, name: &str, score: f64) -> Self {
39        self.details.insert(name.to_string(), score.clamp(0.0, 1.0));
40        self
41    }
42
43    /// Compute overall score
44    fn compute_overall(relevance: f64, coherence: f64, groundedness: f64, harmfulness: f64) -> f64 {
45        // Weighted average (harmfulness is inverted since lower is better)
46        let weights = [0.3, 0.2, 0.3, 0.2]; // relevance, coherence, groundedness, safety
47        let scores = [
48            relevance,
49            coherence,
50            groundedness,
51            1.0 - harmfulness, // Invert harmfulness
52        ];
53
54        let weighted_sum: f64 = weights.iter().zip(scores.iter()).map(|(w, s)| w * s).sum();
55        weighted_sum.clamp(0.0, 1.0)
56    }
57
58    /// Check if result passes quality threshold
59    pub fn passes_threshold(&self, min_overall: f64, max_harmfulness: f64) -> bool {
60        self.overall >= min_overall && self.harmfulness <= max_harmfulness
61    }
62}
63
64impl Default for EvalResult {
65    fn default() -> Self {
66        Self::new(0.0, 0.0, 0.0, 0.0)
67    }
68}