sklears_dummy/validation/
bootstrap_validation.rs1use super::validation_core::*;
2use super::validation_metrics::*;
3
4use scirs2_core::ndarray::{Array1, Axis};
5use scirs2_core::random::{Rng, RngCore};
6use sklears_core::error::{Result, SklearsError};
7use sklears_core::traits::{Fit, Predict};
8use sklears_core::types::{Features, Float, Int};
9use std::collections::HashMap;
10
11use crate::{ClassifierStrategy, DummyClassifier, DummyRegressor, RegressorStrategy};
12
13#[derive(Debug, Clone)]
15pub struct BootstrapValidationResult {
16 pub bootstrap_scores: Vec<Float>,
18 pub mean_score: Float,
20 pub std_score: Float,
22 pub confidence_interval: (Float, Float),
24 pub bias: Float,
26 pub strategy: String,
28 pub n_bootstrap_samples: usize,
30}
31
32impl BootstrapValidationResult {
33 pub fn new(bootstrap_scores: Vec<Float>, strategy: String, confidence_level: Float) -> Self {
34 let n_bootstrap_samples = bootstrap_scores.len();
35 let mean_score = bootstrap_scores.iter().sum::<Float>() / n_bootstrap_samples as Float;
36
37 let variance = bootstrap_scores
38 .iter()
39 .map(|&score| (score - mean_score).powi(2))
40 .sum::<Float>()
41 / n_bootstrap_samples as Float;
42 let std_score = variance.sqrt();
43
44 let mut sorted_scores = bootstrap_scores.clone();
46 sorted_scores.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
47
48 let alpha = 1.0 - confidence_level;
49 let lower_idx = (alpha / 2.0 * n_bootstrap_samples as Float) as usize;
50 let upper_idx = ((1.0 - alpha / 2.0) * n_bootstrap_samples as Float) as usize;
51
52 let lower_bound = sorted_scores[lower_idx.min(n_bootstrap_samples - 1)];
53 let upper_bound = sorted_scores[upper_idx.min(n_bootstrap_samples - 1)];
54 let confidence_interval = (lower_bound, upper_bound);
55
56 let bias = 0.0; Self {
60 bootstrap_scores,
61 mean_score,
62 std_score,
63 confidence_interval,
64 bias,
65 strategy,
66 n_bootstrap_samples,
67 }
68 }
69
70 pub fn percentile(&self, p: Float) -> Float {
71 let mut sorted_scores = self.bootstrap_scores.clone();
72 sorted_scores.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
73
74 let idx = (p * self.n_bootstrap_samples as Float) as usize;
75 sorted_scores[idx.min(self.n_bootstrap_samples - 1)]
76 }
77
78 pub fn bootstrap_distribution_summary(&self) -> StatisticalSummary {
79 StatisticalSummary::from_scores(&self.bootstrap_scores)
80 }
81}
82
83pub fn bootstrap_validate_classifier(
85 classifier: DummyClassifier,
86 x: &Features,
87 y: &Array1<Int>,
88 n_bootstrap: usize,
89 random_state: Option<u64>,
90) -> Result<BootstrapValidationResult> {
91 if n_bootstrap < 1 {
92 return Err(SklearsError::InvalidInput(
93 "Number of bootstrap samples must be at least 1".to_string(),
94 ));
95 }
96
97 let n_samples = x.nrows();
98 if n_samples == 0 {
99 return Err(SklearsError::InvalidInput(
100 "Cannot perform bootstrap validation on empty dataset".to_string(),
101 ));
102 }
103
104 let mut rng = create_rng(random_state);
105 let mut bootstrap_scores = Vec::with_capacity(n_bootstrap);
106
107 for _ in 0..n_bootstrap {
108 let bootstrap_indices = create_bootstrap_sample(n_samples, &mut *rng);
110 let oob_indices = create_out_of_bag_indices(&bootstrap_indices, n_samples);
111
112 if oob_indices.is_empty() {
113 continue; }
115
116 let x_bootstrap = x.select(Axis(0), &bootstrap_indices);
118 let y_bootstrap = y.select(Axis(0), &bootstrap_indices);
119
120 let x_oob = x.select(Axis(0), &oob_indices);
122 let y_oob = y.select(Axis(0), &oob_indices);
123
124 let fitted = classifier.clone().fit(&x_bootstrap, &y_bootstrap)?;
126 let predictions = fitted.predict(&x_oob)?;
127
128 let correct = predictions
130 .iter()
131 .zip(y_oob.iter())
132 .filter(|(&pred, &actual)| pred == actual)
133 .count();
134 let accuracy = correct as Float / oob_indices.len() as Float;
135 bootstrap_scores.push(accuracy);
136 }
137
138 if bootstrap_scores.is_empty() {
139 return Err(SklearsError::InvalidInput(
140 "No valid bootstrap samples created".to_string(),
141 ));
142 }
143
144 Ok(BootstrapValidationResult::new(
145 bootstrap_scores,
146 format!("{:?}", classifier.strategy),
147 0.95,
148 ))
149}
150
151pub fn bootstrap_validate_regressor(
153 regressor: DummyRegressor,
154 x: &Features,
155 y: &Array1<Float>,
156 n_bootstrap: usize,
157 random_state: Option<u64>,
158) -> Result<BootstrapValidationResult> {
159 if n_bootstrap < 1 {
160 return Err(SklearsError::InvalidInput(
161 "Number of bootstrap samples must be at least 1".to_string(),
162 ));
163 }
164
165 let n_samples = x.nrows();
166 if n_samples == 0 {
167 return Err(SklearsError::InvalidInput(
168 "Cannot perform bootstrap validation on empty dataset".to_string(),
169 ));
170 }
171
172 let mut rng = create_rng(random_state);
173 let mut bootstrap_scores = Vec::with_capacity(n_bootstrap);
174
175 for _ in 0..n_bootstrap {
176 let bootstrap_indices = create_bootstrap_sample(n_samples, &mut *rng);
178 let oob_indices = create_out_of_bag_indices(&bootstrap_indices, n_samples);
179
180 if oob_indices.is_empty() {
181 continue; }
183
184 let x_bootstrap = x.select(Axis(0), &bootstrap_indices);
186 let y_bootstrap = y.select(Axis(0), &bootstrap_indices);
187
188 let x_oob = x.select(Axis(0), &oob_indices);
190 let y_oob = y.select(Axis(0), &oob_indices);
191
192 let fitted = regressor.clone().fit(&x_bootstrap, &y_bootstrap)?;
194 let predictions = fitted.predict(&x_oob)?;
195
196 let mse = predictions
198 .iter()
199 .zip(y_oob.iter())
200 .map(|(&pred, &actual)| (pred - actual).powi(2))
201 .sum::<Float>()
202 / oob_indices.len() as Float;
203 bootstrap_scores.push(-mse);
204 }
205
206 if bootstrap_scores.is_empty() {
207 return Err(SklearsError::InvalidInput(
208 "No valid bootstrap samples created".to_string(),
209 ));
210 }
211
212 Ok(BootstrapValidationResult::new(
213 bootstrap_scores,
214 format!("{:?}", regressor.strategy),
215 0.95,
216 ))
217}
218
219fn create_bootstrap_sample(n_samples: usize, rng: &mut dyn RngCore) -> Vec<usize> {
221 (0..n_samples)
222 .map(|_| rng.gen_range(0..n_samples))
223 .collect()
224}
225
/// Returns, in ascending order, every index in `0..n_samples` that does not
/// occur in `bootstrap_indices`.
///
/// Entries of `bootstrap_indices` that fall outside `0..n_samples` are ignored:
/// they cannot name any of the candidate indices, so they have no effect on
/// the result (and no longer cause an out-of-bounds panic).
fn create_out_of_bag_indices(bootstrap_indices: &[usize], n_samples: usize) -> Vec<usize> {
    let mut in_bootstrap = vec![false; n_samples];
    for &idx in bootstrap_indices {
        if let Some(drawn) = in_bootstrap.get_mut(idx) {
            *drawn = true;
        }
    }

    in_bootstrap
        .iter()
        .enumerate()
        .filter(|(_, &drawn)| !drawn)
        .map(|(i, _)| i)
        .collect()
}
235
236pub fn bootstrap_compare_strategies(
238 strategies: &[String],
239 x: &Features,
240 y: &Array1<Float>,
241 n_bootstrap: usize,
242 random_state: Option<u64>,
243) -> Result<Vec<BootstrapValidationResult>> {
244 if strategies.is_empty() {
245 return Err(SklearsError::InvalidInput(
246 "At least one strategy must be provided".to_string(),
247 ));
248 }
249
250 let mut results = Vec::new();
251 let is_classification = is_classification_task(y);
252
253 if is_classification {
254 let y_int: Array1<Int> = y.mapv(|x| x as Int);
255
256 for strategy_name in strategies {
257 let strategy = parse_classifier_strategy(strategy_name)?;
258 let classifier = DummyClassifier::new(strategy);
259 let result =
260 bootstrap_validate_classifier(classifier, x, &y_int, n_bootstrap, random_state)?;
261 results.push(result);
262 }
263 } else {
264 for strategy_name in strategies {
265 let strategy = parse_regressor_strategy(strategy_name)?;
266 let regressor = DummyRegressor::new(strategy);
267 let result = bootstrap_validate_regressor(regressor, x, y, n_bootstrap, random_state)?;
268 results.push(result);
269 }
270 }
271
272 Ok(results)
273}
274
275pub fn bootstrap_hypothesis_test(
277 strategy1: DummyClassifier,
278 strategy2: DummyClassifier,
279 x: &Features,
280 y: &Array1<Int>,
281 n_bootstrap: usize,
282 random_state: Option<u64>,
283) -> Result<BootstrapHypothesisTest> {
284 let mut rng = create_rng(random_state);
285 let n_samples = x.nrows();
286
287 let mut differences = Vec::with_capacity(n_bootstrap);
288
289 for _ in 0..n_bootstrap {
290 let bootstrap_indices = create_bootstrap_sample(n_samples, &mut *rng);
292 let oob_indices = create_out_of_bag_indices(&bootstrap_indices, n_samples);
293
294 if oob_indices.is_empty() {
295 continue;
296 }
297
298 let x_bootstrap = x.select(Axis(0), &bootstrap_indices);
300 let y_bootstrap = y.select(Axis(0), &bootstrap_indices);
301 let x_oob = x.select(Axis(0), &oob_indices);
302 let y_oob = y.select(Axis(0), &oob_indices);
303
304 let fitted1 = strategy1.clone().fit(&x_bootstrap, &y_bootstrap)?;
306 let predictions1 = fitted1.predict(&x_oob)?;
307 let score1 = calculate_classification_score(&predictions1, &y_oob, "accuracy")?;
308
309 let fitted2 = strategy2.clone().fit(&x_bootstrap, &y_bootstrap)?;
310 let predictions2 = fitted2.predict(&x_oob)?;
311 let score2 = calculate_classification_score(&predictions2, &y_oob, "accuracy")?;
312
313 differences.push(score1 - score2);
314 }
315
316 Ok(BootstrapHypothesisTest::new(differences))
317}
318
319#[derive(Debug, Clone)]
321pub struct BootstrapHypothesisTest {
322 pub differences: Vec<Float>,
324 pub mean_difference: Float,
326 pub std_difference: Float,
328 pub p_value: Float,
330 pub confidence_interval: (Float, Float),
332}
333
334impl BootstrapHypothesisTest {
335 pub fn new(differences: Vec<Float>) -> Self {
336 let n = differences.len();
337 let mean_difference = differences.iter().sum::<Float>() / n as Float;
338
339 let variance = differences
340 .iter()
341 .map(|&d| (d - mean_difference).powi(2))
342 .sum::<Float>()
343 / n as Float;
344 let std_difference = variance.sqrt();
345
346 let negative_count = differences.iter().filter(|&&d| d < 0.0).count();
348 let positive_count = differences.iter().filter(|&&d| d > 0.0).count();
349 let p_value = 2.0 * (negative_count.min(positive_count) as Float / n as Float);
350
351 let mut sorted_diffs = differences.clone();
353 sorted_diffs.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
354
355 let lower_idx = (0.025 * n as Float) as usize;
356 let upper_idx = (0.975 * n as Float) as usize;
357 let confidence_interval = (
358 sorted_diffs[lower_idx.min(n - 1)],
359 sorted_diffs[upper_idx.min(n - 1)],
360 );
361
362 Self {
363 differences,
364 mean_difference,
365 std_difference,
366 p_value,
367 confidence_interval,
368 }
369 }
370
371 pub fn is_significant(&self, alpha: Float) -> bool {
372 self.p_value < alpha
373 }
374
375 pub fn effect_size(&self) -> Float {
376 if self.std_difference > 0.0 {
377 self.mean_difference / self.std_difference
378 } else {
379 0.0
380 }
381 }
382}
383
384pub fn stratified_bootstrap_validate_classifier(
386 classifier: DummyClassifier,
387 x: &Features,
388 y: &Array1<Int>,
389 n_bootstrap: usize,
390 random_state: Option<u64>,
391) -> Result<BootstrapValidationResult> {
392 let mut rng = create_rng(random_state);
393 let n_samples = x.nrows();
394
395 let mut class_indices: HashMap<Int, Vec<usize>> = HashMap::new();
397 for (i, &class) in y.iter().enumerate() {
398 class_indices.entry(class).or_default().push(i);
399 }
400
401 let mut bootstrap_scores = Vec::with_capacity(n_bootstrap);
402
403 for _ in 0..n_bootstrap {
404 let mut bootstrap_indices = Vec::new();
405 let mut oob_indices = Vec::new();
406
407 for indices in class_indices.values() {
409 let class_bootstrap = create_bootstrap_sample(indices.len(), &mut *rng);
410 let class_bootstrap_indices: Vec<usize> =
411 class_bootstrap.iter().map(|&i| indices[i]).collect();
412
413 let class_oob = create_out_of_bag_indices(&class_bootstrap, indices.len());
414 let class_oob_indices: Vec<usize> = class_oob.iter().map(|&i| indices[i]).collect();
415
416 bootstrap_indices.extend(class_bootstrap_indices);
417 oob_indices.extend(class_oob_indices);
418 }
419
420 if oob_indices.is_empty() {
421 continue;
422 }
423
424 let x_bootstrap = x.select(Axis(0), &bootstrap_indices);
426 let y_bootstrap = y.select(Axis(0), &bootstrap_indices);
427 let x_oob = x.select(Axis(0), &oob_indices);
428 let y_oob = y.select(Axis(0), &oob_indices);
429
430 let fitted = classifier.clone().fit(&x_bootstrap, &y_bootstrap)?;
431 let predictions = fitted.predict(&x_oob)?;
432
433 let correct = predictions
434 .iter()
435 .zip(y_oob.iter())
436 .filter(|(&pred, &actual)| pred == actual)
437 .count();
438 let accuracy = correct as Float / oob_indices.len() as Float;
439 bootstrap_scores.push(accuracy);
440 }
441
442 if bootstrap_scores.is_empty() {
443 return Err(SklearsError::InvalidInput(
444 "No valid bootstrap samples created".to_string(),
445 ));
446 }
447
448 Ok(BootstrapValidationResult::new(
449 bootstrap_scores,
450 format!("{:?}", classifier.strategy),
451 0.95,
452 ))
453}
454
455fn parse_classifier_strategy(strategy: &str) -> Result<ClassifierStrategy> {
457 match strategy.to_lowercase().as_str() {
458 "mostfrequent" | "most_frequent" => Ok(ClassifierStrategy::MostFrequent),
459 "stratified" => Ok(ClassifierStrategy::Stratified),
460 "uniform" => Ok(ClassifierStrategy::Uniform),
461 "constant" => Ok(ClassifierStrategy::Constant),
462 _ => Err(SklearsError::InvalidInput(format!(
463 "Unknown classifier strategy: {}",
464 strategy
465 ))),
466 }
467}
468
469fn parse_regressor_strategy(strategy: &str) -> Result<RegressorStrategy> {
470 match strategy.to_lowercase().as_str() {
471 "mean" => Ok(RegressorStrategy::Mean),
472 "median" => Ok(RegressorStrategy::Median),
473 "quantile" => Ok(RegressorStrategy::Quantile(0.5)),
474 "constant" => Ok(RegressorStrategy::Constant(0.0)),
475 _ => Err(SklearsError::InvalidInput(format!(
476 "Unknown regressor strategy: {}",
477 strategy
478 ))),
479 }
480}