scirs2-metrics 0.3.0

Machine Learning evaluation metrics module for SciRS2 (scirs2-metrics)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
//! Evaluation workflow utilities
//!
//! This module provides functions and structures for creating end-to-end
//! evaluation workflows for machine learning models, such as pipeline evaluation,
//! batch model evaluation, and automated report generation.

// We're not currently using ndarray types directly in this module
use std::collections::HashMap;
use std::fmt::Debug;

use crate::error::{MetricsError, Result};

/// Unsized closure type for a preprocessing step.
///
/// The closure borrows the features `X` and returns a new, owned `X`, or an
/// error if preprocessing fails. Because this is a `dyn` type it has to be
/// used behind a pointer such as `Box<PreprocessorFn<X>>`.
pub type PreprocessorFn<X> = dyn Fn(&X) -> Result<X>;

/// Unsized closure type for a training step.
///
/// The closure borrows the features `X` and the targets `Y` and, on success,
/// returns a boxed [`ModelEvaluator`] that can subsequently be evaluated.
/// Because this is a `dyn` type it has to be used behind a pointer such as
/// `Box<TrainerFn<X, Y>>`.
pub type TrainerFn<X, Y> = dyn Fn(&X, &Y) -> Result<Box<dyn ModelEvaluator<X, Y>>>;

/// Interface for anything that can score itself on a test dataset.
///
/// `X` is the feature container type and `Y` the target container type; the
/// trait places no bounds on either, so implementors choose their own data
/// representation. The trait is object safe and is used throughout this module
/// as `Box<dyn ModelEvaluator<X, Y>>`.
pub trait ModelEvaluator<X, Y> {
    /// Evaluate a model on a test dataset
    ///
    /// # Arguments
    ///
    /// * `x_test` - Test features
    /// * `y_test` - Test targets
    /// * `metrics` - List of metric names to compute
    ///
    /// # Returns
    ///
    /// * HashMap mapping metric names to their values
    ///
    /// # Errors
    ///
    /// Implementations decide which conditions are errors (for example an
    /// unsupported metric name); the error is returned to the caller unchanged.
    fn evaluate(&self, x_test: &X, y_test: &Y, metrics: &[String]) -> Result<HashMap<String, f64>>;
}

/// EvaluationReport structure for storing and comparing model evaluation results
///
/// The report keeps one `f64` per `(model, dataset, metric)` combination plus
/// three name lists that record which models, datasets and metrics have been
/// seen. The name lists drive the iteration order of every aggregate method
/// and of [`EvaluationReport::generate_report`].
#[derive(Clone, Debug)]
pub struct EvaluationReport {
    /// Model names, in the order they were first added
    pub model_names: Vec<String>,
    /// Dataset names, in the order they were first added
    pub dataset_names: Vec<String>,
    /// Metric names, in the order they were first registered
    pub metric_names: Vec<String>,
    /// Metric values keyed by `(model, dataset, metric)`
    results: HashMap<(String, String, String), f64>,
}

impl Default for EvaluationReport {
    fn default() -> Self {
        Self::new()
    }
}

impl EvaluationReport {
    /// Create a new empty evaluation report
    pub fn new() -> Self {
        EvaluationReport {
            model_names: Vec::new(),
            dataset_names: Vec::new(),
            metric_names: Vec::new(),
            results: HashMap::new(),
        }
    }

    /// Add evaluation results for a model on a dataset
    ///
    /// Names that have not been seen before are appended to the corresponding
    /// name list. A value that already exists for the same
    /// `(model, dataset, metric)` combination is overwritten.
    ///
    /// New metric names from a single call are registered in ascending name
    /// order: `metrics` is a `HashMap`, whose iteration order is arbitrary, and
    /// registering in that order would make `metric_names` (and therefore the
    /// section order of the generated report) differ from run to run.
    ///
    /// # Arguments
    ///
    /// * `modelname` - Name of the model
    /// * `datasetname` - Name of the dataset
    /// * `metrics` - HashMap mapping metric names to their values
    ///
    /// # Returns
    ///
    /// * Result indicating success or failure (currently always `Ok`)
    pub fn add_results(
        &mut self,
        modelname: &str,
        datasetname: &str,
        metrics: HashMap<String, f64>,
    ) -> Result<()> {
        // Add model name if not already present
        if !self.model_names.iter().any(|known| known.as_str() == modelname) {
            self.model_names.push(modelname.to_string());
        }

        // Add dataset name if not already present
        if !self
            .dataset_names
            .iter()
            .any(|known| known.as_str() == datasetname)
        {
            self.dataset_names.push(datasetname.to_string());
        }

        // Fix the processing order so metric registration is reproducible
        let mut entries: Vec<(String, f64)> = metrics.into_iter().collect();
        entries.sort_by(|a, b| a.0.cmp(&b.0));

        for (metricname, value) in entries {
            // Add metric name if not already present
            if !self.metric_names.contains(&metricname) {
                self.metric_names.push(metricname.clone());
            }

            // Store result
            self.results.insert(
                (modelname.to_string(), datasetname.to_string(), metricname),
                value,
            );
        }

        Ok(())
    }

    /// Get result for a specific model, dataset, and metric
    ///
    /// # Arguments
    ///
    /// * `modelname` - Name of the model
    /// * `datasetname` - Name of the dataset
    /// * `metricname` - Name of the metric
    ///
    /// # Returns
    ///
    /// * Option containing the metric value if it exists
    pub fn get_result(&self, modelname: &str, datasetname: &str, metricname: &str) -> Option<f64> {
        // The map is keyed by owned strings, so an owned key is built for the lookup
        let key = (
            modelname.to_string(),
            datasetname.to_string(),
            metricname.to_string(),
        );
        self.results.get(&key).copied()
    }

    /// Get all results for a specific model across all datasets and metrics
    ///
    /// # Arguments
    ///
    /// * `modelname` - Name of the model
    ///
    /// # Returns
    ///
    /// * HashMap mapping (dataset, metric) pairs to values
    pub fn get_model_results(&self, modelname: &str) -> HashMap<(String, String), f64> {
        let mut results = HashMap::new();

        for datasetname in &self.dataset_names {
            for metricname in &self.metric_names {
                if let Some(value) = self.get_result(modelname, datasetname, metricname) {
                    results.insert((datasetname.clone(), metricname.clone()), value);
                }
            }
        }

        results
    }

    /// Get all results for a specific dataset across all models and metrics
    ///
    /// # Arguments
    ///
    /// * `datasetname` - Name of the dataset
    ///
    /// # Returns
    ///
    /// * HashMap mapping (model, metric) pairs to values
    pub fn get_dataset_results(&self, datasetname: &str) -> HashMap<(String, String), f64> {
        let mut results = HashMap::new();

        for modelname in &self.model_names {
            for metricname in &self.metric_names {
                if let Some(value) = self.get_result(modelname, datasetname, metricname) {
                    results.insert((modelname.clone(), metricname.clone()), value);
                }
            }
        }

        results
    }

    /// Get all results for a specific metric across all models and datasets
    ///
    /// # Arguments
    ///
    /// * `metricname` - Name of the metric
    ///
    /// # Returns
    ///
    /// * HashMap mapping (model, dataset) pairs to values
    pub fn get_metric_results(&self, metricname: &str) -> HashMap<(String, String), f64> {
        let mut results = HashMap::new();

        for modelname in &self.model_names {
            for datasetname in &self.dataset_names {
                if let Some(value) = self.get_result(modelname, datasetname, metricname) {
                    results.insert((modelname.clone(), datasetname.clone()), value);
                }
            }
        }

        results
    }

    /// Calculate average performance of models across all datasets
    ///
    /// The average of a model is taken over the datasets for which a value of
    /// `metricname` is stored; models without any such value are omitted from
    /// the returned map.
    ///
    /// # Arguments
    ///
    /// * `metricname` - Name of the metric to average
    ///
    /// # Returns
    ///
    /// * HashMap mapping model names to their average performance
    pub fn average_performance(&self, metricname: &str) -> HashMap<String, f64> {
        let mut averages = HashMap::new();

        for modelname in &self.model_names {
            let mut sum = 0.0;
            let mut count = 0;

            for datasetname in &self.dataset_names {
                if let Some(value) = self.get_result(modelname, datasetname, metricname) {
                    sum += value;
                    count += 1;
                }
            }

            if count > 0 {
                averages.insert(modelname.clone(), sum / count as f64);
            }
        }

        averages
    }

    /// Rank models based on a specific metric
    ///
    /// Models are ordered by their [`average_performance`](Self::average_performance)
    /// for `metricname`. The ordering is deterministic: models with equal
    /// averages keep the order in which they appear in `model_names`, and
    /// models whose average is NaN are placed last regardless of direction.
    /// Models without any value for the metric are not part of the ranking.
    ///
    /// # Arguments
    ///
    /// * `metricname` - Name of the metric to use for ranking
    /// * `higher_isbetter` - Whether higher values are better
    ///
    /// # Returns
    ///
    /// * Vector of model names sorted by their performance, best first
    pub fn rank_models(&self, metricname: &str, higher_isbetter: bool) -> Vec<String> {
        let mut averages = self.average_performance(metricname);

        // Start from registration order (not HashMap order) so that the stable
        // sort below yields reproducible results for ties. `remove_entry`
        // hands back the owned name and guarantees each model appears once.
        let mut ranked: Vec<(String, f64)> = self
            .model_names
            .iter()
            .filter_map(|name| averages.remove_entry(name))
            .collect();

        ranked.sort_by(|a, b| Self::compare_scores(a.1, b.1, higher_isbetter));

        // Extract model names
        ranked.into_iter().map(|(name, _)| name).collect()
    }

    /// Generate a formatted report of evaluation results
    ///
    /// The report is Markdown: one section per metric, each containing a table
    /// with one row per model, one column per dataset and a trailing column
    /// with the model's average. Missing values are rendered as `-`.
    ///
    /// # Returns
    ///
    /// * String containing the formatted report
    pub fn generate_report(&self) -> String {
        let mut report = String::from("# Evaluation Report\n\n");

        // Header and separator depend only on the datasets, so build them once
        let mut header = String::from("| Model |");
        let mut separator = String::from("|--------|");
        for datasetname in &self.dataset_names {
            header.push_str(&format!(" {} |", datasetname));
            separator.push_str("--------|");
        }
        header.push_str(" Average |\n");
        separator.push_str("--------|\n");

        // For each metric
        for metricname in &self.metric_names {
            report.push_str(&format!("## Metric: {}\n\n", metricname));
            report.push_str(&header);
            report.push_str(&separator);

            // Get average performance for this metric
            let averages = self.average_performance(metricname);

            // Table rows
            for modelname in &self.model_names {
                report.push_str(&format!("| {} |", modelname));

                for datasetname in &self.dataset_names {
                    let value = self.get_result(modelname, datasetname, metricname);
                    report.push_str(&Self::format_cell(value));
                }

                // Add average
                report.push_str(&Self::format_cell(averages.get(modelname).copied()));
                report.push('\n');
            }

            report.push('\n');
        }

        report
    }

    /// Order two scores for ranking: best first, NaN always last.
    ///
    /// Handling NaN explicitly keeps the comparison a consistent total order,
    /// which `sort_by` requires.
    fn compare_scores(a: f64, b: f64, higher_isbetter: bool) -> std::cmp::Ordering {
        use std::cmp::Ordering;

        match (a.is_nan(), b.is_nan()) {
            (true, true) => Ordering::Equal,
            (true, false) => Ordering::Greater,
            (false, true) => Ordering::Less,
            (false, false) => {
                // Neither value is NaN, so `partial_cmp` always yields `Some`
                let ascending = a.partial_cmp(&b).unwrap_or(Ordering::Equal);
                if higher_isbetter {
                    ascending.reverse()
                } else {
                    ascending
                }
            }
        }
    }

    /// Render one table cell: four decimals for a value, `-` when absent.
    fn format_cell(value: Option<f64>) -> String {
        match value {
            Some(v) => format!(" {:.4} |", v),
            None => " - |".to_string(),
        }
    }
}

/// BatchEvaluator for evaluating multiple models on multiple datasets
///
/// Models are registered under a name and then scored, with a fixed list of
/// metric names, on one or several datasets. Results are collected into an
/// [`EvaluationReport`].
///
/// Models (and, for [`BatchEvaluator::evaluate_all`], datasets) are processed
/// in ascending name order. They are stored in hash maps whose iteration order
/// is arbitrary, so without a fixed order the name lists of the resulting
/// report, and which error is reported first, would vary between runs.
pub struct BatchEvaluator<X, Y> {
    /// Models to evaluate, keyed by model name
    models: HashMap<String, Box<dyn ModelEvaluator<X, Y>>>,
    /// Metrics to compute
    metrics: Vec<String>,
}

impl<X, Y> BatchEvaluator<X, Y> {
    /// Create a new BatchEvaluator
    ///
    /// # Arguments
    ///
    /// * `metrics` - List of metric names to compute
    pub fn new(metrics: Vec<String>) -> Self {
        BatchEvaluator {
            models: HashMap::new(),
            metrics,
        }
    }

    /// Add a model to the evaluator
    ///
    /// A model previously registered under the same name is replaced.
    ///
    /// # Arguments
    ///
    /// * `name` - Name of the model
    /// * `model` - Model to evaluate
    pub fn add_model(&mut self, name: &str, model: Box<dyn ModelEvaluator<X, Y>>) {
        self.models.insert(name.to_string(), model);
    }

    /// Evaluate all models on a dataset
    ///
    /// # Arguments
    ///
    /// * `datasetname` - Name of the dataset
    /// * `x_test` - Test features
    /// * `y_test` - Test targets
    ///
    /// # Returns
    ///
    /// * EvaluationReport containing the results
    ///
    /// # Errors
    ///
    /// Returns the first error produced by a model's `evaluate` call or by
    /// storing its results; evaluation stops at that point.
    pub fn evaluate_dataset(
        &self,
        datasetname: &str,
        x_test: &X,
        y_test: &Y,
    ) -> Result<EvaluationReport> {
        let mut report = EvaluationReport::new();
        self.evaluate_into(&mut report, datasetname, x_test, y_test)?;
        Ok(report)
    }

    /// Evaluate all models on multiple datasets
    ///
    /// # Arguments
    ///
    /// * `datasets` - HashMap mapping dataset names to (x_test, y_test) pairs
    ///
    /// # Returns
    ///
    /// * EvaluationReport containing all results
    ///
    /// # Errors
    ///
    /// Returns the first error produced by a model's `evaluate` call or by
    /// storing its results; evaluation stops at that point.
    pub fn evaluate_all(&self, datasets: &HashMap<String, (X, Y)>) -> Result<EvaluationReport> {
        let mut report = EvaluationReport::new();

        // Visit datasets in name order for a reproducible report layout
        let mut ordered: Vec<_> = datasets.iter().collect();
        ordered.sort_by(|(a, _), (b, _)| a.cmp(b));

        for (datasetname, (x_test, y_test)) in ordered {
            self.evaluate_into(&mut report, datasetname, x_test, y_test)?;
        }

        Ok(report)
    }

    /// Score every registered model on one dataset and record the results in `report`.
    fn evaluate_into(
        &self,
        report: &mut EvaluationReport,
        datasetname: &str,
        x_test: &X,
        y_test: &Y,
    ) -> Result<()> {
        // Visit models in name order for a reproducible report layout
        let mut ordered: Vec<_> = self.models.iter().collect();
        ordered.sort_by(|(a, _), (b, _)| a.cmp(b));

        for (modelname, model) in ordered {
            let results = model.evaluate(x_test, y_test, &self.metrics)?;
            report.add_results(modelname, datasetname, results)?;
        }

        Ok(())
    }
}

/// Calculate learning curves showing model performance as a function of training set size
///
/// # Current limitations
///
/// This function is a placeholder. Because `X` and `Y` carry no trait bounds,
/// the data cannot be sized or subsampled here, so `_model_evaluator` is never
/// called and the data, metric name and seed are ignored. The total sample
/// count is *estimated* from the largest ratio, and the returned scores are
/// *simulated* from a fixed formula of the training fraction. `n_splits` is
/// only validated.
///
/// # Arguments
///
/// * `_model_evaluator` - Function that trains and evaluates a model (currently unused)
/// * `_train` - Training features (currently unused)
/// * `_train_y` - Training targets (currently unused)
/// * `_test` - Test features (currently unused)
/// * `_test_y` - Test targets (currently unused)
/// * `train_sizes_ratio` - Training set sizes to evaluate, as fractions of the
///   training set; every entry must be finite and greater than 0
/// * `_metric` - Name of the metric to compute (currently unused)
/// * `n_splits` - Number of cross-validation splits for each training size; must be at least 1
/// * `_random_seed` - Optional seed for reproducibility (currently unused)
///
/// # Returns
///
/// * Tuple containing (train_sizes, train_scores, test_scores); the three
///   vectors have one entry per element of `train_sizes_ratio`, and every
///   train size is at least 1
///
/// # Errors
///
/// Returns `MetricsError::InvalidInput` if `train_sizes_ratio` is empty, if
/// `n_splits` is 0, or if any ratio is NaN, infinite, zero or negative.
///
/// # Examples
///
/// ```ignore
/// use scirs2_core::ndarray::{Array1, Array2};
/// use scirs2_metrics::evaluation::workflow::learning_curve;
///
/// // Define a function that trains and evaluates a model
/// let model_evaluator = |x_train: &Array2<f64>, y_train: &Array1<f64>,
///                         x_test: &Array2<f64>, y_test: &Array1<f64>| {
///     // Train model...
///     // Evaluate model...
///     0.85 // Return some score
/// };
///
/// // Generate learning curves
/// let (train_sizes, train_scores, test_scores) = learning_curve(
///     model_evaluator,
///     &x_train, &y_train,
///     &x_test, &y_test,
///     &[0.1, 0.25, 0.5, 0.75, 1.0],
///     "accuracy",
///     5,
///     Some(42)
/// ).expect("Operation failed");
/// ```
#[allow(clippy::too_many_arguments)]
#[allow(dead_code)]
pub fn learning_curve<X, Y, F>(
    _model_evaluator: F,
    _train: &X,
    _train_y: &Y,
    _test: &X,
    _test_y: &Y,
    train_sizes_ratio: &[f64],
    _metric: &str,
    n_splits: usize,
    _random_seed: Option<u64>,
) -> Result<(Vec<usize>, Vec<f64>, Vec<f64>)>
where
    F: Fn(&X, &Y, &X, &Y) -> f64,
{
    // Validate inputs
    if train_sizes_ratio.is_empty() {
        return Err(MetricsError::InvalidInput(
            "train_sizes_ratio must not be empty".to_string(),
        ));
    }

    if n_splits < 1 {
        return Err(MetricsError::InvalidInput(
            "n_splits must be at least 1".to_string(),
        ));
    }

    // A zero, negative or non-finite ratio has no meaning as a training
    // fraction and would otherwise lead to empty training sets and
    // non-finite scores further down.
    if let Some(bad) = train_sizes_ratio
        .iter()
        .copied()
        .find(|ratio| !(ratio.is_finite() && *ratio > 0.0))
    {
        return Err(MetricsError::InvalidInput(format!(
            "train_sizes_ratio entries must be finite and greater than 0, got {}",
            bad
        )));
    }

    // This function is generic over X and Y, so the data size cannot be
    // queried directly; a real implementation would need trait bounds for
    // that. Instead the sample count is estimated from the largest ratio.
    let max_ratio = train_sizes_ratio.iter().copied().fold(0.0_f64, f64::max);
    let base_estimate: f64 = if max_ratio >= 1.0 {
        2000.0 // Full dataset scenarios
    } else if max_ratio >= 0.5 {
        1500.0 // Medium subset scenarios
    } else {
        1000.0 // Small subset scenarios
    };

    // The float-to-integer cast saturates; keep at least one sample so the
    // training fraction below never divides by zero.
    let n_samples = ((base_estimate / max_ratio) as usize).max(1);

    // A training subset needs at least one sample, even if the ratio rounds to zero
    let train_sizes: Vec<usize> = train_sizes_ratio
        .iter()
        .map(|&ratio| ((ratio * n_samples as f64).round() as usize).max(1))
        .collect();

    // Simulate train and test scores from the training fraction.
    // A real implementation would subsample and cross-validate here.
    let (train_scores, test_scores): (Vec<f64>, Vec<f64>) = train_sizes
        .iter()
        .map(|&train_size| {
            let fraction = train_size as f64 / n_samples as f64;
            let train_score = 0.5 + 0.4 * (1.0 - fraction.powf(-0.5));
            let test_score = 0.4 + 0.4 * (1.0 - fraction.powf(-0.2));
            (train_score, test_score)
        })
        .unzip();

    Ok((train_sizes, train_scores, test_scores))
}

/// Evaluates a complete ML pipeline as one unit
///
/// A pipeline bundles a preprocessing function with a training function.
/// Evaluating it runs preprocessing, training and scoring in sequence.
pub struct PipelineEvaluator<X, Y> {
    /// Pipeline name, exposed through `get_name`
    name: String,
    /// Feature transformation, applied to both training and test features
    preprocessor: Box<PreprocessorFn<X>>,
    /// Builds an evaluable model from preprocessed features and targets
    trainer: Box<TrainerFn<X, Y>>,
}

impl<X, Y> PipelineEvaluator<X, Y> {
    /// Build a pipeline from its name and its two stages
    ///
    /// # Arguments
    ///
    /// * `name` - Name of the pipeline
    /// * `preprocessor` - Function mapping raw features to processed features
    /// * `trainer` - Function turning processed features and targets into a model
    pub fn new<P, T>(name: &str, preprocessor: P, trainer: T) -> Self
    where
        P: Fn(&X) -> Result<X> + 'static,
        T: Fn(&X, &Y) -> Result<Box<dyn ModelEvaluator<X, Y>>> + 'static,
    {
        let name = String::from(name);
        let preprocessor: Box<PreprocessorFn<X>> = Box::new(preprocessor);
        let trainer: Box<TrainerFn<X, Y>> = Box::new(trainer);

        Self {
            name,
            preprocessor,
            trainer,
        }
    }

    /// Name given to the pipeline at construction
    pub fn get_name(&self) -> &str {
        self.name.as_str()
    }

    /// Run the whole pipeline and score the resulting model
    ///
    /// The preprocessor is applied to the training features and then to the
    /// test features, the trainer is called with the processed training
    /// features and `y_train`, and the trained model is evaluated on the
    /// processed test features and `y_test`.
    ///
    /// # Arguments
    ///
    /// * `x_train` - Training features
    /// * `y_train` - Training targets
    /// * `x_test` - Test features
    /// * `y_test` - Test targets
    /// * `metrics` - List of metric names to compute
    ///
    /// # Returns
    ///
    /// * HashMap mapping metric names to their values
    ///
    /// # Errors
    ///
    /// Returns the first error raised by the preprocessor, the trainer or the
    /// model's evaluation; later stages are not run.
    pub fn evaluate(
        &self,
        x_train: &X,
        y_train: &Y,
        x_test: &X,
        y_test: &Y,
        metrics: &[String],
    ) -> Result<HashMap<String, f64>> {
        // Stage 1: preprocessing (training features first, then test features)
        let prepared_train = (self.preprocessor)(x_train)?;
        let prepared_test = (self.preprocessor)(x_test)?;

        // Stage 2: training on the processed training data
        let fitted = (self.trainer)(&prepared_train, y_train)?;

        // Stage 3: scoring on the processed test data
        fitted.evaluate(&prepared_test, y_test, metrics)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance for comparing computed floating-point values
    const TOLERANCE: f64 = 1e-12;

    /// Assert that `actual` is present and within `TOLERANCE` of `expected`.
    ///
    /// Computed values such as averages are compared with a tolerance because
    /// exact equality on floating-point arithmetic is fragile.
    fn assert_close(actual: Option<f64>, expected: f64) {
        let value = actual.expect("a value should be present");
        assert!(
            (value - expected).abs() < TOLERANCE,
            "expected {}, got {}",
            expected,
            value
        );
    }

    /// Build a metric map with an `accuracy` and an `error` entry.
    fn scores(accuracy: f64, error: f64) -> HashMap<String, f64> {
        let mut metrics = HashMap::new();
        metrics.insert("accuracy".to_string(), accuracy);
        metrics.insert("error".to_string(), error);
        metrics
    }

    // Simple model implementation for testing: reports a fixed accuracy
    struct DummyModel {
        accuracy: f64,
    }

    impl DummyModel {
        fn new(accuracy: f64) -> Self {
            DummyModel { accuracy }
        }
    }

    // Implement ModelEvaluator for DummyModel with Vec<f64> as X and Y
    impl ModelEvaluator<Vec<f64>, Vec<f64>> for DummyModel {
        fn evaluate(
            &self,
            _x_test: &Vec<f64>,
            _y_test: &Vec<f64>,
            metrics: &[String],
        ) -> Result<HashMap<String, f64>> {
            let mut results = HashMap::new();

            for metric in metrics {
                match metric.as_str() {
                    "accuracy" => {
                        results.insert("accuracy".to_string(), self.accuracy);
                    }
                    "error" => {
                        results.insert("error".to_string(), 1.0 - self.accuracy);
                    }
                    _ => {
                        return Err(MetricsError::InvalidInput(format!(
                            "Unsupported metric: {}",
                            metric
                        )));
                    }
                }
            }

            Ok(results)
        }
    }

    #[test]
    fn test_evaluation_report() {
        // Create a new report and fill it with two models on two datasets
        let mut report = EvaluationReport::new();
        let entries = [
            ("model1", "dataset1", 0.85, 0.15),
            ("model2", "dataset1", 0.80, 0.20),
            ("model1", "dataset2", 0.75, 0.25),
            ("model2", "dataset2", 0.70, 0.30),
        ];
        for &(model, dataset, accuracy, error) in entries.iter() {
            report
                .add_results(model, dataset, scores(accuracy, error))
                .expect("adding results should succeed");
        }

        // Stored values are returned unchanged, so exact equality is fine here
        assert_eq!(
            report.get_result("model1", "dataset1", "accuracy"),
            Some(0.85)
        );
        assert_eq!(report.get_result("model2", "dataset1", "error"), Some(0.20));
        assert_eq!(report.get_result("model3", "dataset1", "accuracy"), None);

        // Check average_performance (computed, so compare with a tolerance)
        let avg_accuracy = report.average_performance("accuracy");
        assert_close(avg_accuracy.get("model1").copied(), 0.80);
        assert_close(avg_accuracy.get("model2").copied(), 0.75);

        // Check rank_models in both directions
        assert_eq!(
            report.rank_models("accuracy", true),
            vec!["model1", "model2"]
        );
        assert_eq!(report.rank_models("error", false), vec!["model1", "model2"]);

        // Check that the generated report contains a section per metric
        let reporttext = report.generate_report();
        assert!(reporttext.starts_with("# Evaluation Report\n\n"));
        assert!(reporttext.contains("## Metric: accuracy\n\n"));
        assert!(reporttext.contains("## Metric: error\n\n"));
    }

    #[test]
    fn test_batch_evaluator() {
        // Create batch evaluator with two models and two metrics
        let metrics = vec!["accuracy".to_string(), "error".to_string()];
        let mut evaluator: BatchEvaluator<Vec<f64>, Vec<f64>> = BatchEvaluator::new(metrics);
        evaluator.add_model("model1", Box::new(DummyModel::new(0.85)));
        evaluator.add_model("model2", Box::new(DummyModel::new(0.75)));

        // Create dummy datasets
        let mut datasets = HashMap::new();
        datasets.insert("dataset1".to_string(), (vec![0.0], vec![1.0]));
        datasets.insert("dataset2".to_string(), (vec![0.0], vec![1.0]));

        // Evaluate all models on all datasets
        let report = evaluator
            .evaluate_all(&datasets)
            .expect("evaluating supported metrics should succeed");

        // Check results
        assert_eq!(
            report.get_result("model1", "dataset1", "accuracy"),
            Some(0.85)
        );
        assert_eq!(
            report.get_result("model2", "dataset1", "accuracy"),
            Some(0.75)
        );
        assert_eq!(
            report.get_result("model1", "dataset2", "accuracy"),
            Some(0.85)
        );
        assert_close(report.get_result("model2", "dataset2", "error"), 0.25);
        assert_eq!(report.model_names.len(), 2);
        assert_eq!(report.dataset_names.len(), 2);
    }

    #[test]
    fn test_batch_evaluator_propagates_model_errors() {
        // DummyModel rejects metrics other than "accuracy" and "error"
        let mut evaluator: BatchEvaluator<Vec<f64>, Vec<f64>> =
            BatchEvaluator::new(vec!["f1".to_string()]);
        evaluator.add_model("model", Box::new(DummyModel::new(0.5)));

        let outcome = evaluator.evaluate_dataset("dataset", &vec![0.0], &vec![1.0]);
        assert!(outcome.is_err());
    }
}