1use crate::{CrossValidator, ParameterValue};
4use scirs2_core::ndarray::{Array1, Array2};
5use sklears_core::{
7 error::Result,
8 prelude::{Predict, SklearsError},
9 traits::Fit,
10 traits::Score,
11 types::Float,
12};
13use sklears_metrics::{
14 classification::accuracy_score, get_scorer, regression::mean_squared_error, Scorer,
15};
16use std::collections::HashMap;
17
18fn compute_score_for_regression_val(
20 metric_name: &str,
21 y_true: &Array1<f64>,
22 y_pred: &Array1<f64>,
23) -> Result<f64> {
24 match metric_name {
25 "neg_mean_squared_error" => Ok(-mean_squared_error(y_true, y_pred)?),
26 "mean_squared_error" => Ok(mean_squared_error(y_true, y_pred)?),
27 _ => {
28 Err(sklears_core::error::SklearsError::InvalidInput(format!(
30 "Metric '{}' not supported for regression",
31 metric_name
32 )))
33 }
34 }
35}
36
37fn compute_score_for_classification_val(
39 metric_name: &str,
40 y_true: &Array1<i32>,
41 y_pred: &Array1<i32>,
42) -> Result<f64> {
43 match metric_name {
44 "accuracy" => Ok(accuracy_score(y_true, y_pred)?),
45 _ => {
46 let scorer = get_scorer(metric_name)?;
47 scorer.score(
48 y_true.as_slice().expect("operation should succeed"),
49 y_pred.as_slice().expect("operation should succeed"),
50 )
51 }
52 }
53}
54
/// Strategy used to score a fitted estimator during cross-validation.
#[derive(Debug, Clone)]
pub enum Scoring {
    /// Call the fitted estimator's own `score` method.
    EstimatorScore,
    /// Resolve a metric by name, e.g. `"accuracy"` or `"neg_mean_squared_error"`.
    Metric(String),
    /// Use an explicit [`Scorer`] object.
    Scorer(Scorer),
    /// Evaluate several named metrics at once. NOTE(review): the single-metric
    /// entry points in this module either reject this variant or fall back to
    /// the estimator's own score.
    MultiMetric(Vec<String>),
    /// User-supplied scoring function taking `(y_true, y_pred)`.
    Custom(fn(&Array1<Float>, &Array1<Float>) -> Result<f64>),
}
69
/// Outcome of a scoring pass: a single value or one value per metric name.
#[derive(Debug, Clone)]
pub enum ScoreResult {
    /// One score from a single metric.
    Single(f64),
    /// Metric-name → score map from a multi-metric evaluation.
    Multiple(HashMap<String, f64>),
}
78
79impl ScoreResult {
80 pub fn as_single(&self) -> f64 {
82 match self {
83 ScoreResult::Single(score) => *score,
84 ScoreResult::Multiple(scores) => scores.values().next().copied().unwrap_or(0.0),
85 }
86 }
87
88 pub fn as_multiple(&self) -> HashMap<String, f64> {
90 match self {
91 ScoreResult::Single(score) => {
92 let mut map = HashMap::new();
93 map.insert("score".to_string(), *score);
94 map
95 }
96 ScoreResult::Multiple(scores) => scores.clone(),
97 }
98 }
99}
100
101#[allow(clippy::too_many_arguments)]
103pub fn cross_validate<E, F, C>(
104 estimator: E,
105 x: &Array2<Float>,
106 y: &Array1<Float>,
107 cv: &C,
108 scoring: Scoring,
109 return_train_score: bool,
110 return_estimator: bool,
111 _n_jobs: Option<usize>,
112) -> Result<CrossValidateResult<F>>
113where
114 E: Clone,
115 F: Clone,
116 E: Fit<Array2<Float>, Array1<Float>, Fitted = F>,
117 F: Predict<Array2<Float>, Array1<Float>>,
118 F: Score<Array2<Float>, Array1<Float>, Float = f64>,
119 C: CrossValidator,
120{
121 let splits = cv.split(x.nrows(), None);
124 let n_splits = splits.len();
125
126 let mut test_scores = Vec::with_capacity(n_splits);
127 let mut train_scores = if return_train_score {
128 Some(Vec::with_capacity(n_splits))
129 } else {
130 None
131 };
132 let mut fit_times = Vec::with_capacity(n_splits);
133 let mut score_times = Vec::with_capacity(n_splits);
134 let mut estimators = if return_estimator {
135 Some(Vec::with_capacity(n_splits))
136 } else {
137 None
138 };
139
140 for (train_idx, test_idx) in splits {
142 let x_train = x.select(scirs2_core::ndarray::Axis(0), &train_idx);
144 let y_train = y.select(scirs2_core::ndarray::Axis(0), &train_idx);
145 let x_test = x.select(scirs2_core::ndarray::Axis(0), &test_idx);
146 let y_test = y.select(scirs2_core::ndarray::Axis(0), &test_idx);
147
148 let start = std::time::Instant::now();
150 let fitted = estimator.clone().fit(&x_train, &y_train)?;
151 let fit_time = start.elapsed().as_secs_f64();
152 fit_times.push(fit_time);
153
154 let start = std::time::Instant::now();
156 let test_score = match &scoring {
157 Scoring::EstimatorScore => fitted.score(&x_test, &y_test)?,
158 Scoring::Custom(func) => {
159 let y_pred = fitted.predict(&x_test)?;
160 func(&y_test.to_owned(), &y_pred)?
161 }
162 Scoring::Metric(metric_name) => {
163 let y_pred = fitted.predict(&x_test)?;
164 if y_test.iter().all(|&x| x.fract() == 0.0) {
166 let y_true_int: Array1<i32> = y_test.mapv(|x| x as i32);
168 let y_pred_int: Array1<i32> = y_pred.mapv(|x| x as i32);
169 compute_score_for_classification_val(metric_name, &y_true_int, &y_pred_int)?
170 } else {
171 compute_score_for_regression_val(metric_name, &y_test, &y_pred)?
173 }
174 }
175 Scoring::Scorer(scorer) => {
176 let y_pred = fitted.predict(&x_test)?;
177 scorer.score_float(
178 y_test.as_slice().expect("operation should succeed"),
179 y_pred.as_slice().expect("operation should succeed"),
180 )?
181 }
182 Scoring::MultiMetric(_) => {
183 return Err(SklearsError::InvalidInput(
184 "MultiMetric scoring not supported in single metric context".to_string(),
185 ));
186 }
187 };
188 let score_time = start.elapsed().as_secs_f64();
189 score_times.push(score_time);
190 test_scores.push(test_score);
191
192 if let Some(ref mut train_scores) = train_scores {
194 let train_score = match &scoring {
195 Scoring::EstimatorScore => fitted.score(&x_train, &y_train)?,
196 Scoring::Custom(func) => {
197 let y_pred = fitted.predict(&x_train)?;
198 func(&y_train.to_owned(), &y_pred)?
199 }
200 Scoring::Metric(metric_name) => {
201 let y_pred = fitted.predict(&x_train)?;
202 if y_train.iter().all(|&x| x.fract() == 0.0) {
204 let y_true_int: Array1<i32> = y_train.mapv(|x| x as i32);
206 let y_pred_int: Array1<i32> = y_pred.mapv(|x| x as i32);
207 compute_score_for_classification_val(metric_name, &y_true_int, &y_pred_int)?
208 } else {
209 compute_score_for_regression_val(metric_name, &y_train, &y_pred)?
211 }
212 }
213 Scoring::Scorer(scorer) => {
214 let y_pred = fitted.predict(&x_train)?;
215 scorer.score_float(
216 y_train.as_slice().expect("operation should succeed"),
217 y_pred.as_slice().expect("operation should succeed"),
218 )?
219 }
220 Scoring::MultiMetric(_metrics) => {
221 fitted.score(&x_train, &y_train)?
223 }
224 };
225 train_scores.push(train_score);
226 }
227
228 if let Some(ref mut estimators) = estimators {
230 estimators.push(fitted);
231 }
232 }
233
234 Ok(CrossValidateResult {
235 test_scores: Array1::from_vec(test_scores),
236 train_scores: train_scores.map(Array1::from_vec),
237 fit_times: Array1::from_vec(fit_times),
238 score_times: Array1::from_vec(score_times),
239 estimators,
240 })
241}
242
/// Per-fold artifacts produced by [`cross_validate`].
#[derive(Debug, Clone)]
pub struct CrossValidateResult<F> {
    /// Score on each fold's held-out test split.
    pub test_scores: Array1<f64>,
    /// Score on each fold's training split (present when requested).
    pub train_scores: Option<Array1<f64>>,
    /// Seconds spent fitting, per fold.
    pub fit_times: Array1<f64>,
    /// Seconds spent scoring the test split, per fold.
    pub score_times: Array1<f64>,
    /// Fitted estimator per fold (present when requested).
    pub estimators: Option<Vec<F>>,
}
252
253pub fn cross_val_score<E, F, C>(
255 estimator: E,
256 x: &Array2<Float>,
257 y: &Array1<Float>,
258 cv: &C,
259 scoring: Option<Scoring>,
260 n_jobs: Option<usize>,
261) -> Result<Array1<f64>>
262where
263 E: Clone,
264 F: Clone,
265 E: Fit<Array2<Float>, Array1<Float>, Fitted = F>,
266 F: Predict<Array2<Float>, Array1<Float>>,
267 F: Score<Array2<Float>, Array1<Float>, Float = f64>,
268 C: CrossValidator,
269{
270 let scoring = scoring.unwrap_or(Scoring::EstimatorScore);
271 let result = cross_validate(
272 estimator, x, y, cv, scoring, false, false, n_jobs,
275 )?;
276
277 Ok(result.test_scores)
278}
279
280pub fn cross_val_predict<E, F, C>(
282 estimator: E,
283 x: &Array2<Float>,
284 y: &Array1<Float>,
285 cv: &C,
286 _n_jobs: Option<usize>,
287) -> Result<Array1<Float>>
288where
289 E: Clone,
290 E: Fit<Array2<Float>, Array1<Float>, Fitted = F>,
291 F: Predict<Array2<Float>, Array1<Float>>,
292 C: CrossValidator,
293{
294 let splits = cv.split(x.nrows(), None);
297 let n_samples = x.nrows();
298
299 let mut predictions = Array1::<Float>::zeros(n_samples);
301
302 for (train_idx, test_idx) in splits {
304 let x_train = x.select(scirs2_core::ndarray::Axis(0), &train_idx);
306 let y_train = y.select(scirs2_core::ndarray::Axis(0), &train_idx);
307 let x_test = x.select(scirs2_core::ndarray::Axis(0), &test_idx);
308
309 let fitted = estimator.clone().fit(&x_train, &y_train)?;
311 let y_pred = fitted.predict(&x_test)?;
312
313 for (i, &idx) in test_idx.iter().enumerate() {
315 predictions[idx] = y_pred[i];
316 }
317 }
318
319 Ok(predictions)
320}
321
/// Output of [`learning_curve`]: raw per-(size, fold) scores plus aggregate
/// statistics for each training-set size.
#[derive(Debug, Clone)]
pub struct LearningCurveResult {
    /// Absolute number of training samples per curve point.
    pub train_sizes: Array1<usize>,
    /// Train scores, shape `(n_train_sizes, n_splits)`.
    pub train_scores: Array2<f64>,
    /// Test scores, shape `(n_train_sizes, n_splits)`.
    pub test_scores: Array2<f64>,
    /// Mean train score per size.
    pub train_scores_mean: Array1<f64>,
    /// Mean test score per size.
    pub test_scores_mean: Array1<f64>,
    /// Sample standard deviation of train scores per size.
    pub train_scores_std: Array1<f64>,
    /// Sample standard deviation of test scores per size.
    pub test_scores_std: Array1<f64>,
    /// Lower confidence bound on the mean train score.
    pub train_scores_lower: Array1<f64>,
    /// Upper confidence bound on the mean train score.
    pub train_scores_upper: Array1<f64>,
    /// Lower confidence bound on the mean test score.
    pub test_scores_lower: Array1<f64>,
    /// Upper confidence bound on the mean test score.
    pub test_scores_upper: Array1<f64>,
}
348
349#[allow(clippy::too_many_arguments)]
364pub fn learning_curve<E, F, C>(
365 estimator: E,
366 x: &Array2<Float>,
367 y: &Array1<Float>,
368 cv: &C,
369 train_sizes: Option<Vec<f64>>,
370 scoring: Option<Scoring>,
371 confidence_level: Option<f64>,
372) -> Result<LearningCurveResult>
373where
374 E: Clone,
375 F: Clone,
376 E: Fit<Array2<Float>, Array1<Float>, Fitted = F>,
377 F: Predict<Array2<Float>, Array1<Float>>,
378 F: Score<Array2<Float>, Array1<Float>, Float = f64>,
379 C: CrossValidator,
380{
381 let n_samples = x.nrows();
382 let scoring = scoring.unwrap_or(Scoring::EstimatorScore);
383
384 let train_size_fractions = train_sizes.unwrap_or_else(|| vec![0.1, 0.3, 0.5, 0.7, 0.9, 1.0]);
386
387 let train_sizes_actual: Vec<usize> = train_size_fractions
389 .iter()
390 .map(|&frac| {
391 let size = (frac * n_samples as f64).round() as usize;
392 size.max(1).min(n_samples) })
394 .collect();
395
396 let n_splits = cv.n_splits();
397 let n_train_sizes = train_sizes_actual.len();
398
399 let mut train_scores = Array2::<f64>::zeros((n_train_sizes, n_splits));
400 let mut test_scores = Array2::<f64>::zeros((n_train_sizes, n_splits));
401
402 let splits = cv.split(x.nrows(), None);
404
405 for (size_idx, &train_size) in train_sizes_actual.iter().enumerate() {
406 for (split_idx, (train_idx, test_idx)) in splits.iter().enumerate() {
407 let mut limited_train_idx = train_idx.clone();
409 if limited_train_idx.len() > train_size {
410 limited_train_idx.truncate(train_size);
411 }
412
413 let x_train = x.select(scirs2_core::ndarray::Axis(0), &limited_train_idx);
415 let y_train = y.select(scirs2_core::ndarray::Axis(0), &limited_train_idx);
416 let x_test = x.select(scirs2_core::ndarray::Axis(0), test_idx);
417 let y_test = y.select(scirs2_core::ndarray::Axis(0), test_idx);
418
419 let fitted = estimator.clone().fit(&x_train, &y_train)?;
421
422 let train_score = match &scoring {
424 Scoring::EstimatorScore => fitted.score(&x_train, &y_train)?,
425 Scoring::Custom(func) => {
426 let y_pred = fitted.predict(&x_train)?;
427 func(&y_train.to_owned(), &y_pred)?
428 }
429 Scoring::Metric(metric_name) => {
430 let y_pred = fitted.predict(&x_train)?;
431 if y_train.iter().all(|&x| x.fract() == 0.0) {
433 let y_true_int: Array1<i32> = y_train.mapv(|x| x as i32);
435 let y_pred_int: Array1<i32> = y_pred.mapv(|x| x as i32);
436 compute_score_for_classification_val(metric_name, &y_true_int, &y_pred_int)?
437 } else {
438 compute_score_for_regression_val(metric_name, &y_train, &y_pred)?
440 }
441 }
442 Scoring::Scorer(scorer) => {
443 let y_pred = fitted.predict(&x_train)?;
444 scorer.score_float(
445 y_train.as_slice().expect("operation should succeed"),
446 y_pred.as_slice().expect("operation should succeed"),
447 )?
448 }
449 Scoring::MultiMetric(_metrics) => {
450 fitted.score(&x_train, &y_train)?
452 }
453 };
454 train_scores[[size_idx, split_idx]] = train_score;
455
456 let test_score = match &scoring {
458 Scoring::EstimatorScore => fitted.score(&x_test, &y_test)?,
459 Scoring::Custom(func) => {
460 let y_pred = fitted.predict(&x_test)?;
461 func(&y_test.to_owned(), &y_pred)?
462 }
463 Scoring::Metric(metric_name) => {
464 let y_pred = fitted.predict(&x_test)?;
465 if y_test.iter().all(|&x| x.fract() == 0.0) {
467 let y_true_int: Array1<i32> = y_test.mapv(|x| x as i32);
469 let y_pred_int: Array1<i32> = y_pred.mapv(|x| x as i32);
470 compute_score_for_classification_val(metric_name, &y_true_int, &y_pred_int)?
471 } else {
472 compute_score_for_regression_val(metric_name, &y_test, &y_pred)?
474 }
475 }
476 Scoring::Scorer(scorer) => {
477 let y_pred = fitted.predict(&x_test)?;
478 scorer.score_float(
479 y_test.as_slice().expect("operation should succeed"),
480 y_pred.as_slice().expect("operation should succeed"),
481 )?
482 }
483 Scoring::MultiMetric(_metrics) => {
484 fitted.score(&x_test, &y_test)?
486 }
487 };
488 test_scores[[size_idx, split_idx]] = test_score;
489 }
490 }
491
492 let confidence = confidence_level.unwrap_or(0.95);
494 let _alpha = 1.0 - confidence;
495 let z_score = 1.96; let mut train_scores_mean = Array1::<f64>::zeros(n_train_sizes);
499 let mut test_scores_mean = Array1::<f64>::zeros(n_train_sizes);
500 let mut train_scores_std = Array1::<f64>::zeros(n_train_sizes);
501 let mut test_scores_std = Array1::<f64>::zeros(n_train_sizes);
502 let mut train_scores_lower = Array1::<f64>::zeros(n_train_sizes);
503 let mut train_scores_upper = Array1::<f64>::zeros(n_train_sizes);
504 let mut test_scores_lower = Array1::<f64>::zeros(n_train_sizes);
505 let mut test_scores_upper = Array1::<f64>::zeros(n_train_sizes);
506
507 for size_idx in 0..n_train_sizes {
508 let train_scores_for_size: Vec<f64> = (0..n_splits)
510 .map(|split_idx| train_scores[[size_idx, split_idx]])
511 .collect();
512 let test_scores_for_size: Vec<f64> = (0..n_splits)
513 .map(|split_idx| test_scores[[size_idx, split_idx]])
514 .collect();
515
516 let train_mean = train_scores_for_size.iter().sum::<f64>() / n_splits as f64;
518 let train_variance = train_scores_for_size
519 .iter()
520 .map(|&x| (x - train_mean).powi(2))
521 .sum::<f64>()
522 / (n_splits - 1).max(1) as f64;
523 let train_std = train_variance.sqrt();
524 let train_sem = train_std / (n_splits as f64).sqrt(); let test_mean = test_scores_for_size.iter().sum::<f64>() / n_splits as f64;
528 let test_variance = test_scores_for_size
529 .iter()
530 .map(|&x| (x - test_mean).powi(2))
531 .sum::<f64>()
532 / (n_splits - 1).max(1) as f64;
533 let test_std = test_variance.sqrt();
534 let test_sem = test_std / (n_splits as f64).sqrt(); let train_margin = z_score * train_sem;
538 let test_margin = z_score * test_sem;
539
540 train_scores_mean[size_idx] = train_mean;
541 test_scores_mean[size_idx] = test_mean;
542 train_scores_std[size_idx] = train_std;
543 test_scores_std[size_idx] = test_std;
544 train_scores_lower[size_idx] = train_mean - train_margin;
545 train_scores_upper[size_idx] = train_mean + train_margin;
546 test_scores_lower[size_idx] = test_mean - test_margin;
547 test_scores_upper[size_idx] = test_mean + test_margin;
548 }
549
550 Ok(LearningCurveResult {
551 train_sizes: Array1::from_vec(train_sizes_actual),
552 train_scores,
553 test_scores,
554 train_scores_mean,
555 test_scores_mean,
556 train_scores_std,
557 test_scores_std,
558 train_scores_lower,
559 train_scores_upper,
560 test_scores_lower,
561 test_scores_upper,
562 })
563}
564
/// Output of [`validation_curve`]: raw per-(parameter, fold) scores plus
/// aggregate statistics for each candidate parameter value.
#[derive(Debug, Clone)]
pub struct ValidationCurveResult {
    /// Candidate parameter values, in evaluation order.
    pub param_values: Vec<ParameterValue>,
    /// Train scores, shape `(n_params, n_splits)`.
    pub train_scores: Array2<f64>,
    /// Test scores, shape `(n_params, n_splits)`.
    pub test_scores: Array2<f64>,
    /// Mean train score per parameter value.
    pub train_scores_mean: Array1<f64>,
    /// Mean test score per parameter value.
    pub test_scores_mean: Array1<f64>,
    /// Sample standard deviation of train scores per parameter value.
    pub train_scores_std: Array1<f64>,
    /// Sample standard deviation of test scores per parameter value.
    pub test_scores_std: Array1<f64>,
    /// Lower confidence bound on the mean train score.
    pub train_scores_lower: Array1<f64>,
    /// Upper confidence bound on the mean train score.
    pub train_scores_upper: Array1<f64>,
    /// Lower confidence bound on the mean test score.
    pub test_scores_lower: Array1<f64>,
    /// Upper confidence bound on the mean test score.
    pub test_scores_upper: Array1<f64>,
}
591
/// Callback that applies one [`ParameterValue`] to a (cloned) estimator,
/// returning the configured estimator or an error.
pub type ParamConfigFn<E> = Box<dyn Fn(E, &ParameterValue) -> Result<E>>;
594
595#[allow(clippy::too_many_arguments)]
612pub fn validation_curve<E, F, C>(
613 estimator: E,
614 x: &Array2<Float>,
615 y: &Array1<Float>,
616 _param_name: &str,
617 param_range: Vec<ParameterValue>,
618 param_config: ParamConfigFn<E>,
619 cv: &C,
620 scoring: Option<Scoring>,
621 confidence_level: Option<f64>,
622) -> Result<ValidationCurveResult>
623where
624 E: Clone,
625 F: Clone,
626 E: Fit<Array2<Float>, Array1<Float>, Fitted = F>,
627 F: Predict<Array2<Float>, Array1<Float>>,
628 F: Score<Array2<Float>, Array1<Float>, Float = f64>,
629 C: CrossValidator,
630{
631 let scoring = scoring.unwrap_or(Scoring::EstimatorScore);
632 let n_splits = cv.n_splits();
633 let n_params = param_range.len();
634
635 let mut train_scores = Array2::<f64>::zeros((n_params, n_splits));
636 let mut test_scores = Array2::<f64>::zeros((n_params, n_splits));
637
638 let splits = cv.split(x.nrows(), None);
640
641 for (param_idx, param_value) in param_range.iter().enumerate() {
642 for (split_idx, (train_idx, test_idx)) in splits.iter().enumerate() {
643 let x_train = x.select(scirs2_core::ndarray::Axis(0), train_idx);
645 let y_train = y.select(scirs2_core::ndarray::Axis(0), train_idx);
646 let x_test = x.select(scirs2_core::ndarray::Axis(0), test_idx);
647 let y_test = y.select(scirs2_core::ndarray::Axis(0), test_idx);
648
649 let configured_estimator = param_config(estimator.clone(), param_value)?;
651
652 let fitted = configured_estimator.fit(&x_train, &y_train)?;
654
655 let train_score = match &scoring {
657 Scoring::EstimatorScore => fitted.score(&x_train, &y_train)?,
658 Scoring::Custom(func) => {
659 let y_pred = fitted.predict(&x_train)?;
660 func(&y_train.to_owned(), &y_pred)?
661 }
662 Scoring::Metric(metric_name) => {
663 let y_pred = fitted.predict(&x_train)?;
664 if y_train.iter().all(|&x| x.fract() == 0.0) {
666 let y_true_int: Array1<i32> = y_train.mapv(|x| x as i32);
668 let y_pred_int: Array1<i32> = y_pred.mapv(|x| x as i32);
669 compute_score_for_classification_val(metric_name, &y_true_int, &y_pred_int)?
670 } else {
671 compute_score_for_regression_val(metric_name, &y_train, &y_pred)?
673 }
674 }
675 Scoring::Scorer(scorer) => {
676 let y_pred = fitted.predict(&x_train)?;
677 scorer.score_float(
678 y_train.as_slice().expect("operation should succeed"),
679 y_pred.as_slice().expect("operation should succeed"),
680 )?
681 }
682 Scoring::MultiMetric(_metrics) => {
683 fitted.score(&x_train, &y_train)?
685 }
686 };
687 train_scores[[param_idx, split_idx]] = train_score;
688
689 let test_score = match &scoring {
691 Scoring::EstimatorScore => fitted.score(&x_test, &y_test)?,
692 Scoring::Custom(func) => {
693 let y_pred = fitted.predict(&x_test)?;
694 func(&y_test.to_owned(), &y_pred)?
695 }
696 Scoring::Metric(metric_name) => {
697 let y_pred = fitted.predict(&x_test)?;
698 if y_test.iter().all(|&x| x.fract() == 0.0) {
700 let y_true_int: Array1<i32> = y_test.mapv(|x| x as i32);
702 let y_pred_int: Array1<i32> = y_pred.mapv(|x| x as i32);
703 compute_score_for_classification_val(metric_name, &y_true_int, &y_pred_int)?
704 } else {
705 compute_score_for_regression_val(metric_name, &y_test, &y_pred)?
707 }
708 }
709 Scoring::Scorer(scorer) => {
710 let y_pred = fitted.predict(&x_test)?;
711 scorer.score_float(
712 y_test.as_slice().expect("operation should succeed"),
713 y_pred.as_slice().expect("operation should succeed"),
714 )?
715 }
716 Scoring::MultiMetric(_metrics) => {
717 fitted.score(&x_test, &y_test)?
719 }
720 };
721 test_scores[[param_idx, split_idx]] = test_score;
722 }
723 }
724
725 let _confidence = confidence_level.unwrap_or(0.95);
727 let _z_score = 1.96; let mut train_scores_mean = Array1::<f64>::zeros(n_params);
731 let mut test_scores_mean = Array1::<f64>::zeros(n_params);
732 let mut train_scores_std = Array1::<f64>::zeros(n_params);
733 let mut test_scores_std = Array1::<f64>::zeros(n_params);
734 let mut train_scores_lower = Array1::<f64>::zeros(n_params);
735 let mut train_scores_upper = Array1::<f64>::zeros(n_params);
736 let mut test_scores_lower = Array1::<f64>::zeros(n_params);
737 let mut test_scores_upper = Array1::<f64>::zeros(n_params);
738
739 for param_idx in 0..n_params {
740 let train_scores_for_param: Vec<f64> = (0..n_splits)
742 .map(|split_idx| train_scores[[param_idx, split_idx]])
743 .collect();
744 let test_scores_for_param: Vec<f64> = (0..n_splits)
745 .map(|split_idx| test_scores[[param_idx, split_idx]])
746 .collect();
747
748 let train_mean = train_scores_for_param.iter().sum::<f64>() / n_splits as f64;
750 let train_variance = train_scores_for_param
751 .iter()
752 .map(|&x| (x - train_mean).powi(2))
753 .sum::<f64>()
754 / (n_splits - 1).max(1) as f64;
755 let train_std = train_variance.sqrt();
756 let train_sem = train_std / (n_splits as f64).sqrt(); let test_mean = test_scores_for_param.iter().sum::<f64>() / n_splits as f64;
760 let test_variance = test_scores_for_param
761 .iter()
762 .map(|&x| (x - test_mean).powi(2))
763 .sum::<f64>()
764 / (n_splits - 1).max(1) as f64;
765 let test_std = test_variance.sqrt();
766 let test_sem = test_std / (n_splits as f64).sqrt(); let train_margin = train_sem;
770 let test_margin = test_sem;
771
772 train_scores_mean[param_idx] = train_mean;
773 test_scores_mean[param_idx] = test_mean;
774 train_scores_std[param_idx] = train_std;
775 test_scores_std[param_idx] = test_std;
776 train_scores_lower[param_idx] = train_mean - train_margin;
777 train_scores_upper[param_idx] = train_mean + train_margin;
778 test_scores_lower[param_idx] = test_mean - test_margin;
779 test_scores_upper[param_idx] = test_mean + test_margin;
780 }
781
782 Ok(ValidationCurveResult {
783 param_values: param_range,
784 train_scores,
785 test_scores,
786 train_scores_mean,
787 test_scores_mean,
788 train_scores_std,
789 test_scores_std,
790 train_scores_lower,
791 train_scores_upper,
792 test_scores_lower,
793 test_scores_upper,
794 })
795}
796
797#[allow(clippy::too_many_arguments)]
802pub fn permutation_test_score<E, F, C>(
803 estimator: E,
804 x: &Array2<Float>,
805 y: &Array1<Float>,
806 cv: &C,
807 scoring: Option<Scoring>,
808 n_permutations: usize,
809 random_state: Option<u64>,
810 n_jobs: Option<usize>,
811) -> Result<PermutationTestResult>
812where
813 E: Clone,
814 F: Clone,
815 E: Fit<Array2<Float>, Array1<Float>, Fitted = F>,
816 F: Predict<Array2<Float>, Array1<Float>>,
817 F: Score<Array2<Float>, Array1<Float>, Float = f64>,
818 C: CrossValidator,
819{
820 use scirs2_core::random::prelude::*;
821 use scirs2_core::random::rngs::StdRng;
822
823 let scoring = scoring.unwrap_or(Scoring::EstimatorScore);
824
825 let original_scores =
827 cross_val_score(estimator.clone(), x, y, cv, Some(scoring.clone()), n_jobs)?;
828 let original_score = original_scores.mean().unwrap_or(0.0);
829
830 let mut rng = if let Some(seed) = random_state {
832 StdRng::seed_from_u64(seed)
833 } else {
834 StdRng::seed_from_u64(42)
835 };
836
837 let mut permutation_scores = Vec::with_capacity(n_permutations);
839
840 for _ in 0..n_permutations {
841 let mut y_permuted = y.to_owned();
843 let mut indices: Vec<usize> = (0..y.len()).collect();
844 indices.shuffle(&mut rng);
845
846 for (i, &perm_idx) in indices.iter().enumerate() {
847 y_permuted[i] = y[perm_idx];
848 }
849
850 let perm_scores = cross_val_score(
852 estimator.clone(),
853 x,
854 &y_permuted,
855 cv,
856 Some(scoring.clone()),
857 n_jobs,
858 )?;
859 let perm_score = perm_scores.mean().unwrap_or(0.0);
860 permutation_scores.push(perm_score);
861 }
862
863 let n_better_or_equal = permutation_scores
865 .iter()
866 .filter(|&&score| score >= original_score)
867 .count();
868 let p_value = (n_better_or_equal + 1) as f64 / (n_permutations + 1) as f64;
869
870 Ok(PermutationTestResult {
871 statistic: original_score,
872 pvalue: p_value,
873 permutation_scores: Array1::from_vec(permutation_scores),
874 })
875}
876
/// Output of [`permutation_test_score`].
#[derive(Debug, Clone)]
pub struct PermutationTestResult {
    /// Mean cross-validation score on the unpermuted labels.
    pub statistic: f64,
    /// Empirical p-value: `(1 + #{perm >= statistic}) / (1 + n_permutations)`.
    pub pvalue: f64,
    /// Mean CV score for each label permutation.
    pub permutation_scores: Array1<f64>,
}
887
888#[allow(clippy::too_many_arguments)]
894pub fn nested_cross_validate<E, F, C>(
895 estimator: E,
896 x: &Array2<Float>,
897 y: &Array1<Float>,
898 outer_cv: &C,
899 inner_cv: &C,
900 param_grid: &[ParameterValue],
901 param_config: ParamConfigFn<E>,
902 scoring: Option<fn(&Array1<Float>, &Array1<Float>) -> f64>,
903) -> Result<NestedCVResult>
904where
905 E: Clone,
906 F: Clone,
907 E: Fit<Array2<Float>, Array1<Float>, Fitted = F>,
908 F: Predict<Array2<Float>, Array1<Float>>,
909 F: Score<Array2<Float>, Array1<Float>, Float = f64>,
910 C: CrossValidator,
911{
912 let outer_splits = outer_cv.split(x.nrows(), None);
913 let mut outer_scores = Vec::with_capacity(outer_splits.len());
914 let mut best_params_per_fold = Vec::with_capacity(outer_splits.len());
915 let mut inner_scores_per_fold = Vec::with_capacity(outer_splits.len());
916
917 for (outer_train_idx, outer_test_idx) in outer_splits {
918 let outer_train_x = extract_rows(x, &outer_train_idx);
920 let outer_train_y = extract_elements(y, &outer_train_idx);
921 let outer_test_x = extract_rows(x, &outer_test_idx);
922 let outer_test_y = extract_elements(y, &outer_test_idx);
923
924 let mut best_score = f64::NEG_INFINITY;
926 let mut best_param = param_grid[0].clone();
927 let mut inner_scores = Vec::new();
928
929 for param in param_grid {
930 let param_estimator = param_config(estimator.clone(), param)?;
931
932 let inner_splits = inner_cv.split(outer_train_x.nrows(), None);
934 let mut param_scores = Vec::new();
935
936 for (inner_train_idx, inner_test_idx) in inner_splits {
937 let inner_train_x = extract_rows(&outer_train_x, &inner_train_idx);
938 let inner_train_y = extract_elements(&outer_train_y, &inner_train_idx);
939 let inner_test_x = extract_rows(&outer_train_x, &inner_test_idx);
940 let inner_test_y = extract_elements(&outer_train_y, &inner_test_idx);
941
942 let fitted = param_estimator
944 .clone()
945 .fit(&inner_train_x, &inner_train_y)?;
946 let predictions = fitted.predict(&inner_test_x)?;
947
948 let score = if let Some(scoring_fn) = scoring {
949 scoring_fn(&inner_test_y, &predictions)
950 } else {
951 fitted.score(&inner_test_x, &inner_test_y)?
952 };
953
954 param_scores.push(score);
955 }
956
957 let mean_score = param_scores.iter().sum::<f64>() / param_scores.len() as f64;
958 inner_scores.push(mean_score);
959
960 if mean_score > best_score {
961 best_score = mean_score;
962 best_param = param.clone();
963 }
964 }
965
966 let best_estimator = param_config(estimator.clone(), &best_param)?;
968 let final_fitted = best_estimator.fit(&outer_train_x, &outer_train_y)?;
969 let outer_predictions = final_fitted.predict(&outer_test_x)?;
970
971 let outer_score = if let Some(scoring_fn) = scoring {
972 scoring_fn(&outer_test_y, &outer_predictions)
973 } else {
974 final_fitted.score(&outer_test_x, &outer_test_y)?
975 };
976
977 outer_scores.push(outer_score);
978 best_params_per_fold.push(best_param);
979 inner_scores_per_fold.push(inner_scores);
980 }
981
982 let mean_score = outer_scores.iter().sum::<f64>() / outer_scores.len() as f64;
983 let std_score = {
984 let variance = outer_scores
985 .iter()
986 .map(|&x| (x - mean_score).powi(2))
987 .sum::<f64>()
988 / outer_scores.len() as f64;
989 variance.sqrt()
990 };
991
992 Ok(NestedCVResult {
993 outer_scores: Array1::from_vec(outer_scores),
994 best_params_per_fold,
995 inner_scores_per_fold,
996 mean_outer_score: mean_score,
997 std_outer_score: std_score,
998 })
999}
1000
/// Output of [`nested_cross_validate`].
#[derive(Debug, Clone)]
pub struct NestedCVResult {
    /// Score of the tuned model on each outer test fold.
    pub outer_scores: Array1<f64>,
    /// Parameter value selected by the inner loop for each outer fold.
    pub best_params_per_fold: Vec<ParameterValue>,
    /// Mean inner-CV score per candidate parameter, for each outer fold.
    pub inner_scores_per_fold: Vec<Vec<f64>>,
    /// Mean of `outer_scores`.
    pub mean_outer_score: f64,
    /// Population standard deviation of `outer_scores`.
    pub std_outer_score: f64,
}
1015
1016fn extract_rows(arr: &Array2<Float>, indices: &[usize]) -> Array2<Float> {
1018 let mut result = Array2::zeros((indices.len(), arr.ncols()));
1019 for (i, &idx) in indices.iter().enumerate() {
1020 for j in 0..arr.ncols() {
1021 result[[i, j]] = arr[[idx, j]];
1022 }
1023 }
1024 result
1025}
1026
1027fn extract_elements(arr: &Array1<Float>, indices: &[usize]) -> Array1<Float> {
1028 Array1::from_iter(indices.iter().map(|&i| arr[i]))
1029}
1030
1031#[allow(non_snake_case)]
1032#[cfg(test)]
1033mod tests {
1034 use super::*;
1035 use crate::KFold;
1036 use scirs2_core::ndarray::array;
1037
1038 #[derive(Clone)]
1040 struct MockEstimator;
1041
1042 #[derive(Clone)]
1043 struct MockFitted {
1044 train_mean: f64,
1045 }
1046
1047 impl Fit<Array2<Float>, Array1<Float>> for MockEstimator {
1048 type Fitted = MockFitted;
1049
1050 fn fit(self, _x: &Array2<Float>, y: &Array1<Float>) -> Result<Self::Fitted> {
1051 Ok(MockFitted {
1052 train_mean: y.mean().unwrap_or(0.0),
1053 })
1054 }
1055 }
1056
1057 impl Predict<Array2<Float>, Array1<Float>> for MockFitted {
1058 fn predict(&self, x: &Array2<Float>) -> Result<Array1<Float>> {
1059 Ok(Array1::from_elem(x.nrows(), self.train_mean))
1060 }
1061 }
1062
1063 impl Score<Array2<Float>, Array1<Float>> for MockFitted {
1064 type Float = Float;
1065
1066 fn score(&self, x: &Array2<Float>, y: &Array1<Float>) -> Result<f64> {
1067 let y_pred = self.predict(x)?;
1068 let mse = (y - &y_pred).mapv(|e| e * e).mean().unwrap_or(0.0);
1069 Ok(1.0 - mse) }
1071 }
1072
1073 #[test]
1074 fn test_cross_val_score() {
1075 let x = array![[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]];
1076 let y = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
1077
1078 let estimator = MockEstimator;
1079 let cv = KFold::new(3);
1080
1081 let scores =
1082 cross_val_score(estimator, &x, &y, &cv, None, None).expect("operation should succeed");
1083
1084 assert_eq!(scores.len(), 3);
1085 for score in scores.iter() {
1087 assert!(*score <= 1.0);
1088 }
1089 }
1090
1091 #[test]
1092 fn test_cross_val_predict() {
1093 let x = array![[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]];
1094 let y = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
1095
1096 let estimator = MockEstimator;
1097 let cv = KFold::new(3);
1098
1099 let predictions =
1100 cross_val_predict(estimator, &x, &y, &cv, None).expect("operation should succeed");
1101
1102 assert_eq!(predictions.len(), 6);
1103 }
1107
1108 #[test]
1109 fn test_learning_curve() {
1110 let x = array![
1111 [1.0],
1112 [2.0],
1113 [3.0],
1114 [4.0],
1115 [5.0],
1116 [6.0],
1117 [7.0],
1118 [8.0],
1119 [9.0],
1120 [10.0]
1121 ];
1122 let y = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
1123
1124 let estimator = MockEstimator;
1125 let cv = KFold::new(3);
1126
1127 let result = learning_curve(
1128 estimator,
1129 &x,
1130 &y,
1131 &cv,
1132 Some(vec![0.3, 0.6, 1.0]), None,
1134 None, )
1136 .expect("operation should succeed");
1137
1138 assert_eq!(result.train_sizes.len(), 3);
1140 assert_eq!(result.train_scores.dim(), (3, 3)); assert_eq!(result.test_scores.dim(), (3, 3));
1142
1143 assert_eq!(result.train_sizes[0], 3); assert_eq!(result.train_sizes[1], 6); assert_eq!(result.train_sizes[2], 10); let mean_train_score = result
1150 .train_scores
1151 .mean()
1152 .expect("operation should succeed");
1153 let mean_test_score = result.test_scores.mean().expect("operation should succeed");
1154 assert!(mean_train_score >= mean_test_score);
1156
1157 assert_eq!(result.train_scores_mean.len(), 3);
1159 assert_eq!(result.test_scores_mean.len(), 3);
1160 assert_eq!(result.train_scores_std.len(), 3);
1161 assert_eq!(result.test_scores_std.len(), 3);
1162 assert_eq!(result.train_scores_lower.len(), 3);
1163 assert_eq!(result.train_scores_upper.len(), 3);
1164 assert_eq!(result.test_scores_lower.len(), 3);
1165 assert_eq!(result.test_scores_upper.len(), 3);
1166
1167 for i in 0..3 {
1169 assert!(result.train_scores_lower[i] <= result.train_scores_mean[i]);
1170 assert!(result.train_scores_mean[i] <= result.train_scores_upper[i]);
1171 assert!(result.test_scores_lower[i] <= result.test_scores_mean[i]);
1172 assert!(result.test_scores_mean[i] <= result.test_scores_upper[i]);
1173 }
1174 }
1175
1176 #[test]
1177 fn test_validation_curve() {
1178 let x = array![[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]];
1179 let y = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
1180
1181 let estimator = MockEstimator;
1182 let cv = KFold::new(3);
1183
1184 let param_config: ParamConfigFn<MockEstimator> = Box::new(|estimator, _param_value| {
1186 Ok(estimator)
1188 });
1189
1190 let param_range = vec![
1191 ParameterValue::Float(0.1),
1192 ParameterValue::Float(0.5),
1193 ParameterValue::Float(1.0),
1194 ];
1195
1196 let result = validation_curve(
1197 estimator,
1198 &x,
1199 &y,
1200 "mock_param",
1201 param_range.clone(),
1202 param_config,
1203 &cv,
1204 None,
1205 None, )
1207 .expect("operation should succeed");
1208
1209 assert_eq!(result.param_values.len(), 3);
1211 assert_eq!(result.train_scores.dim(), (3, 3)); assert_eq!(result.test_scores.dim(), (3, 3));
1213
1214 assert_eq!(result.param_values, param_range);
1216
1217 let train_score_std = {
1219 let mean = result
1220 .train_scores
1221 .mean()
1222 .expect("operation should succeed");
1223 let variance = result
1224 .train_scores
1225 .mapv(|x| (x - mean).powi(2))
1226 .mean()
1227 .expect("operation should succeed");
1228 variance.sqrt()
1229 };
1230
1231 assert!(train_score_std < 2.0);
1234
1235 assert_eq!(result.train_scores_mean.len(), 3);
1237 assert_eq!(result.test_scores_mean.len(), 3);
1238 assert_eq!(result.train_scores_std.len(), 3);
1239 assert_eq!(result.test_scores_std.len(), 3);
1240 assert_eq!(result.train_scores_lower.len(), 3);
1241 assert_eq!(result.train_scores_upper.len(), 3);
1242 assert_eq!(result.test_scores_lower.len(), 3);
1243 assert_eq!(result.test_scores_upper.len(), 3);
1244
1245 for i in 0..3 {
1247 assert!(result.train_scores_lower[i] <= result.train_scores_mean[i]);
1248 assert!(result.train_scores_mean[i] <= result.train_scores_upper[i]);
1249 assert!(result.test_scores_lower[i] <= result.test_scores_mean[i]);
1250 assert!(result.test_scores_mean[i] <= result.test_scores_upper[i]);
1251 }
1252 }
1253
1254 #[test]
1255 fn test_learning_curve_default_sizes() {
1256 let x = array![
1257 [1.0],
1258 [2.0],
1259 [3.0],
1260 [4.0],
1261 [5.0],
1262 [6.0],
1263 [7.0],
1264 [8.0],
1265 [9.0],
1266 [10.0]
1267 ];
1268 let y = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
1269
1270 let estimator = MockEstimator;
1271 let cv = KFold::new(2);
1272
1273 let result = learning_curve(
1274 estimator, &x, &y, &cv, None, None, None, )
1277 .expect("operation should succeed");
1278
1279 assert_eq!(result.train_sizes.len(), 6);
1281 assert_eq!(result.train_scores.dim(), (6, 2)); for i in 1..result.train_sizes.len() {
1285 assert!(result.train_sizes[i] >= result.train_sizes[i - 1]);
1286 }
1287 }
1288
1289 #[test]
1290 fn test_permutation_test_score() {
1291 let x = array![[1.0], [2.0], [3.0], [4.0], [5.0], [6.0], [7.0], [8.0]];
1292 let y = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
1293
1294 let estimator = MockEstimator;
1295 let cv = KFold::new(4);
1296
1297 let result = permutation_test_score(
1298 estimator,
1299 &x,
1300 &y,
1301 &cv,
1302 None,
1303 10, Some(42),
1305 None,
1306 )
1307 .expect("operation should succeed");
1308
1309 assert!(result.pvalue >= 0.0 && result.pvalue <= 1.0);
1311 assert_eq!(result.permutation_scores.len(), 10);
1312
1313 assert!(result.statistic.is_finite());
1316
1317 for &score in result.permutation_scores.iter() {
1319 assert!(score.is_finite());
1320 }
1321
1322 let n_better = result
1324 .permutation_scores
1325 .iter()
1326 .filter(|&&score| score >= result.statistic)
1327 .count();
1328 let expected_p = (n_better + 1) as f64 / 11.0; assert!((result.pvalue - expected_p).abs() < 1e-10);
1330 }
1331
1332 #[test]
1333 fn test_nested_cross_validate() {
1334 let x = array![
1335 [1.0],
1336 [2.0],
1337 [3.0],
1338 [4.0],
1339 [5.0],
1340 [6.0],
1341 [7.0],
1342 [8.0],
1343 [9.0],
1344 [10.0]
1345 ];
1346 let y = array![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
1347
1348 let estimator = MockEstimator;
1349 let outer_cv = KFold::new(3);
1350 let inner_cv = KFold::new(2);
1351
1352 let param_config: ParamConfigFn<MockEstimator> = Box::new(|estimator, _param_value| {
1354 Ok(estimator)
1356 });
1357
1358 let param_grid = vec![
1359 ParameterValue::Float(0.1),
1360 ParameterValue::Float(0.5),
1361 ParameterValue::Float(1.0),
1362 ];
1363
1364 let result = nested_cross_validate(
1365 estimator,
1366 &x,
1367 &y,
1368 &outer_cv,
1369 &inner_cv,
1370 ¶m_grid,
1371 param_config,
1372 None,
1373 )
1374 .expect("operation should succeed");
1375
1376 assert_eq!(result.outer_scores.len(), 3); assert_eq!(result.best_params_per_fold.len(), 3);
1379 assert_eq!(result.inner_scores_per_fold.len(), 3);
1380
1381 for inner_scores in &result.inner_scores_per_fold {
1383 assert_eq!(inner_scores.len(), 3); }
1385
1386 for &score in result.outer_scores.iter() {
1388 assert!(score.is_finite());
1389 }
1390
1391 let manual_mean =
1393 result.outer_scores.iter().sum::<f64>() / result.outer_scores.len() as f64;
1394 assert!((result.mean_outer_score - manual_mean).abs() < 1e-10);
1395
1396 assert!(result.std_outer_score >= 0.0);
1397 assert!(result.std_outer_score.is_finite());
1398 }
1399}