1use crate::error::EvaluationError;
2use crate::util::{parse_embedder, post_process};
3use ndarray::Array2;
4use potato_head::{create_uuid7, Embedder, PyHelperFuncs, Score};
5use pyo3::prelude::*;
6use pyo3::types::PyDict;
7use pyo3::IntoPyObjectExt;
8use scouter_profile::{Histogram, NumProfiler};
9use scouter_types::{is_pydantic_basemodel, json_to_pyobject_value, pyobject_to_json};
10use serde::{Deserialize, Serialize};
11use serde_json::Value;
12use std::collections::{BTreeMap, HashMap};
13use std::sync::Arc;
14
15pub fn array_to_dict<'py>(
16 py: Python<'py>,
17 array: &ArrayDataset,
18) -> Result<Bound<'py, PyDict>, EvaluationError> {
19 let pydict = PyDict::new(py);
20
21 pydict.set_item(
23 "task",
24 array.idx_map.values().cloned().collect::<Vec<String>>(),
25 )?;
26
27 for (i, feature) in array.feature_names.iter().enumerate() {
29 let column_data: Vec<f64> = array.data.column(i).to_vec();
30 pydict.set_item(feature, column_data)?;
31 }
32
33 if array.clusters.len() == array.data.nrows() {
35 pydict.set_item("cluster", array.clusters.clone())?;
36 }
37 Ok(pydict)
38}
39
40#[derive(Debug, Serialize, Deserialize)]
42#[pyclass]
43pub struct LLMEvalResults {
44 pub results: HashMap<String, LLMEvalTaskResult>,
45
46 #[pyo3(get)]
47 pub errored_tasks: Vec<String>,
48
49 pub cluster_data: Option<ClusterData>,
50
51 #[pyo3(get)]
52 pub histograms: Option<HashMap<String, Histogram>>,
53
54 #[serde(skip)]
55 pub array_dataset: Option<ArrayDataset>,
56}
57
58#[pymethods]
59impl LLMEvalResults {
60 pub fn __getitem__(&self, key: &str) -> Result<LLMEvalTaskResult, EvaluationError> {
62 match self.results.get(key) {
63 Some(value) => Ok(value.clone()),
64 None => Err(EvaluationError::MissingKeyError(key.to_string())),
65 }
66 }
67
68 pub fn __str__(&self) -> String {
69 PyHelperFuncs::__str__(self)
70 }
71
72 pub fn model_dump_json(&self) -> String {
73 PyHelperFuncs::__json__(self)
75 }
76
77 #[staticmethod]
78 pub fn model_validate_json(json_string: String) -> Result<LLMEvalResults, EvaluationError> {
79 Ok(serde_json::from_str(&json_string)?)
81 }
82
83 #[pyo3(signature = (polars=false))]
84 pub fn to_dataframe<'py>(
85 &mut self,
86 py: Python<'py>,
87 polars: bool,
88 ) -> Result<Bound<'py, PyAny>, EvaluationError> {
89 if self.array_dataset.is_none() {
90 self.build_array_dataset()?;
91 }
92
93 let dataset = self.array_dataset.as_ref().unwrap();
94 let records = array_to_dict(py, dataset)?;
95
96 let module = if polars { "polars" } else { "pandas" };
97
98 let df_module = py.import(module)?;
99 let df_class = df_module.getattr("DataFrame")?;
100
101 if polars {
102 Ok(df_class.call1((records,))?)
103 } else {
104 Ok(df_class.call_method1("from_dict", (records,))?)
105 }
106 }
107
108 #[getter]
109 pub fn cluster_data(&self) -> Option<ClusterData> {
110 self.cluster_data.clone()
111 }
112
113 #[getter]
114 pub fn successful_count(&self) -> usize {
115 self.results.len()
116 }
117
118 #[getter]
119 pub fn failed_count(&self) -> usize {
120 self.errored_tasks.len()
121 }
122}
123
124impl LLMEvalResults {
125 pub fn finalize(&mut self, config: &Arc<EvaluationConfig>) -> Result<(), EvaluationError> {
134 if !config.embedding_targets.is_empty() {
136 post_process(self, config);
137 }
138
139 if config.compute_histograms {
140 self.build_array_dataset()?;
141
142 if let Some(array_dataset) = &self.array_dataset {
144 let profiler = NumProfiler::new();
145 let histograms = profiler.compute_histogram(
146 &array_dataset.data.view(),
147 &array_dataset.feature_names,
148 &10,
149 false,
150 )?;
151 self.histograms = Some(histograms);
152 }
153 }
154
155 Ok(())
156 }
157
158 fn build_array_dataset(&mut self) -> Result<(), EvaluationError> {
160 if self.array_dataset.is_none() {
161 self.array_dataset = Some(ArrayDataset::from_results(self)?);
162 }
163 Ok(())
164 }
165}
166
167#[derive(Debug, Clone, Serialize, Deserialize)]
168#[pyclass]
169pub struct ClusterData {
170 #[pyo3(get)]
171 pub x: Vec<f64>,
172 #[pyo3(get)]
173 pub y: Vec<f64>,
174 #[pyo3(get)]
175 pub clusters: Vec<i32>,
176 pub idx_map: HashMap<usize, String>,
177}
178
179impl ClusterData {
180 pub fn new(
181 x: Vec<f64>,
182 y: Vec<f64>,
183 clusters: Vec<i32>,
184 idx_map: HashMap<usize, String>,
185 ) -> Self {
186 ClusterData {
187 x,
188 y,
189 clusters,
190 idx_map,
191 }
192 }
193}
194
195#[derive(Debug)]
196pub struct ArrayDataset {
197 pub data: Array2<f64>,
198 pub feature_names: Vec<String>,
199 pub idx_map: HashMap<usize, String>,
200 pub clusters: Vec<i32>,
201}
202
203impl Default for ArrayDataset {
204 fn default() -> Self {
205 Self::new()
206 }
207}
208
209impl ArrayDataset {
210 pub fn new() -> Self {
211 Self {
212 data: Array2::zeros((0, 0)),
213 feature_names: Vec::new(),
214 idx_map: HashMap::new(),
215 clusters: vec![],
216 }
217 }
218
219 fn build_feature_names(results: &LLMEvalResults) -> Result<Vec<String>, EvaluationError> {
223 let first_task = results
224 .results
225 .values()
226 .next()
227 .ok_or(EvaluationError::NoResultsFound)?;
228
229 let mut names = Vec::new();
230
231 names.extend(first_task.metrics.keys().cloned());
233 names.extend(first_task.mean_embeddings.keys().cloned());
234 names.extend(first_task.similarity_scores.keys().cloned());
235
236 Ok(names)
237 }
238
239 fn from_results(results: &LLMEvalResults) -> Result<Self, EvaluationError> {
240 if results.results.is_empty() {
241 return Ok(Self::new());
242 }
243
244 let feature_names = Self::build_feature_names(results)?;
245 let n_rows = results.results.len();
246 let n_cols = feature_names.len();
247
248 let mut data = Vec::with_capacity(n_rows * n_cols);
249 let mut idx_map = HashMap::new();
250
251 for (i, task) in results.results.values().enumerate() {
253 idx_map.insert(i, task.id.clone());
254
255 let row: Vec<f64> = feature_names
257 .iter()
258 .map(|name| {
259 if let Some(score) = task.metrics.get(name) {
260 score.score as f64
261 } else if let Some(&mean) = task.mean_embeddings.get(name) {
262 mean
263 } else if let Some(&sim) = task.similarity_scores.get(name) {
264 sim
265 } else {
266 0.0 }
268 })
269 .collect();
270
271 data.extend(row);
272 }
273
274 let array = Array2::from_shape_vec((n_rows, n_cols), data)?;
275
276 Ok(Self {
277 data: array,
278 feature_names,
279 idx_map,
280 clusters: vec![],
281 })
282 }
283}
284
285impl LLMEvalResults {
286 pub fn new() -> Self {
287 Self {
288 results: HashMap::new(),
289 errored_tasks: Vec::new(),
290 array_dataset: None,
291 cluster_data: None,
292 histograms: None,
293 }
294 }
295}
296
297impl Default for LLMEvalResults {
298 fn default() -> Self {
299 Self::new()
300 }
301}
302
303#[derive(Debug, Clone, Serialize, Deserialize)]
305#[pyclass]
306pub struct LLMEvalTaskResult {
307 #[pyo3(get)]
308 pub id: String,
309
310 #[pyo3(get)]
311 pub metrics: BTreeMap<String, Score>,
312
313 #[pyo3(get)]
314 #[serde(skip)]
315 pub embedding: BTreeMap<String, Vec<f32>>,
316
317 #[pyo3(get)]
318 pub mean_embeddings: BTreeMap<String, f64>,
319
320 #[pyo3(get)]
321 pub similarity_scores: BTreeMap<String, f64>,
322}
323
324#[pymethods]
325impl LLMEvalTaskResult {
326 pub fn __str__(&self) -> String {
327 PyHelperFuncs::__str__(self)
328 }
329}
330
331impl LLMEvalTaskResult {
332 pub fn new(
333 id: String,
334 metrics: BTreeMap<String, Score>,
335 embedding: BTreeMap<String, Vec<f32>>,
336 ) -> Self {
337 Self {
338 id,
339 metrics,
340 embedding,
341 mean_embeddings: BTreeMap::new(),
342 similarity_scores: BTreeMap::new(),
343 }
344 }
345}
346
347#[pyclass]
348#[derive(Clone, Debug)]
349pub struct LLMEvalRecord {
350 pub id: String,
351 pub context: Value,
352}
353
354#[pymethods]
355impl LLMEvalRecord {
356 #[new]
357 #[pyo3(signature = (
358 context,
359 id=None
360 ))]
361
362 pub fn new(
365 py: Python<'_>,
366 context: Bound<'_, PyAny>,
367 id: Option<String>,
368 ) -> Result<Self, EvaluationError> {
369 let context_val = if context.is_instance_of::<PyDict>() {
371 pyobject_to_json(&context)?
372 } else if is_pydantic_basemodel(py, &context)? {
373 let model = context.call_method0("model_dump")?;
375
376 pyobject_to_json(&model)?
378 } else {
379 Err(EvaluationError::MustBeDictOrBaseModel)?
380 };
381
382 let id = id.unwrap_or_else(create_uuid7);
383
384 Ok(LLMEvalRecord {
385 id,
386 context: context_val,
387 })
388 }
389
390 #[getter]
391 pub fn context<'py>(&self, py: Python<'py>) -> Result<Bound<'py, PyAny>, EvaluationError> {
392 Ok(json_to_pyobject_value(py, &self.context)?
393 .into_bound_py_any(py)?
394 .clone())
395 }
396}
397
398#[derive(Debug, Clone, Default)]
399#[pyclass]
400pub struct EvaluationConfig {
401 pub embedder: Option<Arc<Embedder>>,
403
404 pub embedding_targets: Vec<String>,
406
407 pub compute_similarity: bool,
410
411 pub cluster: bool,
413
414 pub compute_histograms: bool,
416}
417
418#[pymethods]
419impl EvaluationConfig {
420 #[new]
421 #[pyo3(signature = (embedder=None, embedding_targets=None, compute_similarity=false, cluster=false, compute_histograms=false))]
422 fn new(
432 embedder: Option<&Bound<'_, PyAny>>,
433 embedding_targets: Option<Vec<String>>,
434 compute_similarity: bool,
435 cluster: bool,
436 compute_histograms: bool,
437 ) -> Result<Self, EvaluationError> {
438 let embedder = parse_embedder(embedder)?;
439 let embedding_targets = embedding_targets.unwrap_or_default();
440
441 Ok(Self {
442 embedder,
443 embedding_targets,
444 compute_similarity,
445 cluster,
446 compute_histograms,
447 })
448 }
449
450 pub fn needs_post_processing(&self) -> bool {
451 !self.embedding_targets.is_empty() || self.cluster
452 }
453}