sklears_dummy/validation/
bootstrap_validation.rs

1use super::validation_core::*;
2use super::validation_metrics::*;
3
4use scirs2_core::ndarray::{Array1, Axis};
5use scirs2_core::random::{Rng, RngCore};
6use sklears_core::error::{Result, SklearsError};
7use sklears_core::traits::{Fit, Predict};
8use sklears_core::types::{Features, Float, Int};
9use std::collections::HashMap;
10
11use crate::{ClassifierStrategy, DummyClassifier, DummyRegressor, RegressorStrategy};
12
13/// Bootstrap validation result
14#[derive(Debug, Clone)]
15pub struct BootstrapValidationResult {
16    /// bootstrap_scores
17    pub bootstrap_scores: Vec<Float>,
18    /// mean_score
19    pub mean_score: Float,
20    /// std_score
21    pub std_score: Float,
22    /// confidence_interval
23    pub confidence_interval: (Float, Float),
24    /// bias
25    pub bias: Float,
26    /// strategy
27    pub strategy: String,
28    /// n_bootstrap_samples
29    pub n_bootstrap_samples: usize,
30}
31
32impl BootstrapValidationResult {
33    pub fn new(bootstrap_scores: Vec<Float>, strategy: String, confidence_level: Float) -> Self {
34        let n_bootstrap_samples = bootstrap_scores.len();
35        let mean_score = bootstrap_scores.iter().sum::<Float>() / n_bootstrap_samples as Float;
36
37        let variance = bootstrap_scores
38            .iter()
39            .map(|&score| (score - mean_score).powi(2))
40            .sum::<Float>()
41            / n_bootstrap_samples as Float;
42        let std_score = variance.sqrt();
43
44        // Calculate confidence interval using percentile method
45        let mut sorted_scores = bootstrap_scores.clone();
46        sorted_scores.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
47
48        let alpha = 1.0 - confidence_level;
49        let lower_idx = (alpha / 2.0 * n_bootstrap_samples as Float) as usize;
50        let upper_idx = ((1.0 - alpha / 2.0) * n_bootstrap_samples as Float) as usize;
51
52        let lower_bound = sorted_scores[lower_idx.min(n_bootstrap_samples - 1)];
53        let upper_bound = sorted_scores[upper_idx.min(n_bootstrap_samples - 1)];
54        let confidence_interval = (lower_bound, upper_bound);
55
56        // Bias calculation (simplified)
57        let bias = 0.0; // Would require original score for proper bias calculation
58
59        Self {
60            bootstrap_scores,
61            mean_score,
62            std_score,
63            confidence_interval,
64            bias,
65            strategy,
66            n_bootstrap_samples,
67        }
68    }
69
70    pub fn percentile(&self, p: Float) -> Float {
71        let mut sorted_scores = self.bootstrap_scores.clone();
72        sorted_scores.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
73
74        let idx = (p * self.n_bootstrap_samples as Float) as usize;
75        sorted_scores[idx.min(self.n_bootstrap_samples - 1)]
76    }
77
78    pub fn bootstrap_distribution_summary(&self) -> StatisticalSummary {
79        StatisticalSummary::from_scores(&self.bootstrap_scores)
80    }
81}
82
83/// Perform bootstrap validation for a dummy classifier
84pub fn bootstrap_validate_classifier(
85    classifier: DummyClassifier,
86    x: &Features,
87    y: &Array1<Int>,
88    n_bootstrap: usize,
89    random_state: Option<u64>,
90) -> Result<BootstrapValidationResult> {
91    if n_bootstrap < 1 {
92        return Err(SklearsError::InvalidInput(
93            "Number of bootstrap samples must be at least 1".to_string(),
94        ));
95    }
96
97    let n_samples = x.nrows();
98    if n_samples == 0 {
99        return Err(SklearsError::InvalidInput(
100            "Cannot perform bootstrap validation on empty dataset".to_string(),
101        ));
102    }
103
104    let mut rng = create_rng(random_state);
105    let mut bootstrap_scores = Vec::with_capacity(n_bootstrap);
106
107    for _ in 0..n_bootstrap {
108        // Create bootstrap sample
109        let bootstrap_indices = create_bootstrap_sample(n_samples, &mut *rng);
110        let oob_indices = create_out_of_bag_indices(&bootstrap_indices, n_samples);
111
112        if oob_indices.is_empty() {
113            continue; // Skip if no out-of-bag samples
114        }
115
116        // Extract bootstrap training data
117        let x_bootstrap = x.select(Axis(0), &bootstrap_indices);
118        let y_bootstrap = y.select(Axis(0), &bootstrap_indices);
119
120        // Extract out-of-bag test data
121        let x_oob = x.select(Axis(0), &oob_indices);
122        let y_oob = y.select(Axis(0), &oob_indices);
123
124        // Fit on bootstrap sample and predict on out-of-bag
125        let fitted = classifier.clone().fit(&x_bootstrap, &y_bootstrap)?;
126        let predictions = fitted.predict(&x_oob)?;
127
128        // Calculate accuracy
129        let correct = predictions
130            .iter()
131            .zip(y_oob.iter())
132            .filter(|(&pred, &actual)| pred == actual)
133            .count();
134        let accuracy = correct as Float / oob_indices.len() as Float;
135        bootstrap_scores.push(accuracy);
136    }
137
138    if bootstrap_scores.is_empty() {
139        return Err(SklearsError::InvalidInput(
140            "No valid bootstrap samples created".to_string(),
141        ));
142    }
143
144    Ok(BootstrapValidationResult::new(
145        bootstrap_scores,
146        format!("{:?}", classifier.strategy),
147        0.95,
148    ))
149}
150
151/// Perform bootstrap validation for a dummy regressor
152pub fn bootstrap_validate_regressor(
153    regressor: DummyRegressor,
154    x: &Features,
155    y: &Array1<Float>,
156    n_bootstrap: usize,
157    random_state: Option<u64>,
158) -> Result<BootstrapValidationResult> {
159    if n_bootstrap < 1 {
160        return Err(SklearsError::InvalidInput(
161            "Number of bootstrap samples must be at least 1".to_string(),
162        ));
163    }
164
165    let n_samples = x.nrows();
166    if n_samples == 0 {
167        return Err(SklearsError::InvalidInput(
168            "Cannot perform bootstrap validation on empty dataset".to_string(),
169        ));
170    }
171
172    let mut rng = create_rng(random_state);
173    let mut bootstrap_scores = Vec::with_capacity(n_bootstrap);
174
175    for _ in 0..n_bootstrap {
176        // Create bootstrap sample
177        let bootstrap_indices = create_bootstrap_sample(n_samples, &mut *rng);
178        let oob_indices = create_out_of_bag_indices(&bootstrap_indices, n_samples);
179
180        if oob_indices.is_empty() {
181            continue; // Skip if no out-of-bag samples
182        }
183
184        // Extract bootstrap training data
185        let x_bootstrap = x.select(Axis(0), &bootstrap_indices);
186        let y_bootstrap = y.select(Axis(0), &bootstrap_indices);
187
188        // Extract out-of-bag test data
189        let x_oob = x.select(Axis(0), &oob_indices);
190        let y_oob = y.select(Axis(0), &oob_indices);
191
192        // Fit on bootstrap sample and predict on out-of-bag
193        let fitted = regressor.clone().fit(&x_bootstrap, &y_bootstrap)?;
194        let predictions = fitted.predict(&x_oob)?;
195
196        // Calculate negative MSE
197        let mse = predictions
198            .iter()
199            .zip(y_oob.iter())
200            .map(|(&pred, &actual)| (pred - actual).powi(2))
201            .sum::<Float>()
202            / oob_indices.len() as Float;
203        bootstrap_scores.push(-mse);
204    }
205
206    if bootstrap_scores.is_empty() {
207        return Err(SklearsError::InvalidInput(
208            "No valid bootstrap samples created".to_string(),
209        ));
210    }
211
212    Ok(BootstrapValidationResult::new(
213        bootstrap_scores,
214        format!("{:?}", regressor.strategy),
215        0.95,
216    ))
217}
218
219/// Create a bootstrap sample of indices
220fn create_bootstrap_sample(n_samples: usize, rng: &mut dyn RngCore) -> Vec<usize> {
221    (0..n_samples)
222        .map(|_| rng.gen_range(0..n_samples))
223        .collect()
224}
225
/// List the indices in `0..n_samples` that never occur in `bootstrap_indices`.
///
/// The result is in ascending order. Panics if `bootstrap_indices` contains a value that
/// is not below `n_samples`.
fn create_out_of_bag_indices(bootstrap_indices: &[usize], n_samples: usize) -> Vec<usize> {
    // One flag per sample: true once the sample has been drawn at least once.
    let mut drawn = vec![false; n_samples];
    bootstrap_indices.iter().for_each(|&idx| drawn[idx] = true);

    drawn
        .iter()
        .enumerate()
        .filter_map(|(i, &was_drawn)| if was_drawn { None } else { Some(i) })
        .collect()
}
235
236/// Perform bootstrap validation with multiple strategies
237pub fn bootstrap_compare_strategies(
238    strategies: &[String],
239    x: &Features,
240    y: &Array1<Float>,
241    n_bootstrap: usize,
242    random_state: Option<u64>,
243) -> Result<Vec<BootstrapValidationResult>> {
244    if strategies.is_empty() {
245        return Err(SklearsError::InvalidInput(
246            "At least one strategy must be provided".to_string(),
247        ));
248    }
249
250    let mut results = Vec::new();
251    let is_classification = is_classification_task(y);
252
253    if is_classification {
254        let y_int: Array1<Int> = y.mapv(|x| x as Int);
255
256        for strategy_name in strategies {
257            let strategy = parse_classifier_strategy(strategy_name)?;
258            let classifier = DummyClassifier::new(strategy);
259            let result =
260                bootstrap_validate_classifier(classifier, x, &y_int, n_bootstrap, random_state)?;
261            results.push(result);
262        }
263    } else {
264        for strategy_name in strategies {
265            let strategy = parse_regressor_strategy(strategy_name)?;
266            let regressor = DummyRegressor::new(strategy);
267            let result = bootstrap_validate_regressor(regressor, x, y, n_bootstrap, random_state)?;
268            results.push(result);
269        }
270    }
271
272    Ok(results)
273}
274
275/// Bootstrap hypothesis test for comparing two strategies
276pub fn bootstrap_hypothesis_test(
277    strategy1: DummyClassifier,
278    strategy2: DummyClassifier,
279    x: &Features,
280    y: &Array1<Int>,
281    n_bootstrap: usize,
282    random_state: Option<u64>,
283) -> Result<BootstrapHypothesisTest> {
284    let mut rng = create_rng(random_state);
285    let n_samples = x.nrows();
286
287    let mut differences = Vec::with_capacity(n_bootstrap);
288
289    for _ in 0..n_bootstrap {
290        // Create bootstrap sample
291        let bootstrap_indices = create_bootstrap_sample(n_samples, &mut *rng);
292        let oob_indices = create_out_of_bag_indices(&bootstrap_indices, n_samples);
293
294        if oob_indices.is_empty() {
295            continue;
296        }
297
298        // Extract data
299        let x_bootstrap = x.select(Axis(0), &bootstrap_indices);
300        let y_bootstrap = y.select(Axis(0), &bootstrap_indices);
301        let x_oob = x.select(Axis(0), &oob_indices);
302        let y_oob = y.select(Axis(0), &oob_indices);
303
304        // Evaluate both strategies
305        let fitted1 = strategy1.clone().fit(&x_bootstrap, &y_bootstrap)?;
306        let predictions1 = fitted1.predict(&x_oob)?;
307        let score1 = calculate_classification_score(&predictions1, &y_oob, "accuracy")?;
308
309        let fitted2 = strategy2.clone().fit(&x_bootstrap, &y_bootstrap)?;
310        let predictions2 = fitted2.predict(&x_oob)?;
311        let score2 = calculate_classification_score(&predictions2, &y_oob, "accuracy")?;
312
313        differences.push(score1 - score2);
314    }
315
316    Ok(BootstrapHypothesisTest::new(differences))
317}
318
319/// Bootstrap hypothesis test result
320#[derive(Debug, Clone)]
321pub struct BootstrapHypothesisTest {
322    /// differences
323    pub differences: Vec<Float>,
324    /// mean_difference
325    pub mean_difference: Float,
326    /// std_difference
327    pub std_difference: Float,
328    /// p_value
329    pub p_value: Float,
330    /// confidence_interval
331    pub confidence_interval: (Float, Float),
332}
333
334impl BootstrapHypothesisTest {
335    pub fn new(differences: Vec<Float>) -> Self {
336        let n = differences.len();
337        let mean_difference = differences.iter().sum::<Float>() / n as Float;
338
339        let variance = differences
340            .iter()
341            .map(|&d| (d - mean_difference).powi(2))
342            .sum::<Float>()
343            / n as Float;
344        let std_difference = variance.sqrt();
345
346        // Calculate p-value (two-tailed test for H0: difference = 0)
347        let negative_count = differences.iter().filter(|&&d| d < 0.0).count();
348        let positive_count = differences.iter().filter(|&&d| d > 0.0).count();
349        let p_value = 2.0 * (negative_count.min(positive_count) as Float / n as Float);
350
351        // Calculate 95% confidence interval
352        let mut sorted_diffs = differences.clone();
353        sorted_diffs.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
354
355        let lower_idx = (0.025 * n as Float) as usize;
356        let upper_idx = (0.975 * n as Float) as usize;
357        let confidence_interval = (
358            sorted_diffs[lower_idx.min(n - 1)],
359            sorted_diffs[upper_idx.min(n - 1)],
360        );
361
362        Self {
363            differences,
364            mean_difference,
365            std_difference,
366            p_value,
367            confidence_interval,
368        }
369    }
370
371    pub fn is_significant(&self, alpha: Float) -> bool {
372        self.p_value < alpha
373    }
374
375    pub fn effect_size(&self) -> Float {
376        if self.std_difference > 0.0 {
377            self.mean_difference / self.std_difference
378        } else {
379            0.0
380        }
381    }
382}
383
384/// Stratified bootstrap validation
385pub fn stratified_bootstrap_validate_classifier(
386    classifier: DummyClassifier,
387    x: &Features,
388    y: &Array1<Int>,
389    n_bootstrap: usize,
390    random_state: Option<u64>,
391) -> Result<BootstrapValidationResult> {
392    let mut rng = create_rng(random_state);
393    let n_samples = x.nrows();
394
395    // Group indices by class
396    let mut class_indices: HashMap<Int, Vec<usize>> = HashMap::new();
397    for (i, &class) in y.iter().enumerate() {
398        class_indices.entry(class).or_default().push(i);
399    }
400
401    let mut bootstrap_scores = Vec::with_capacity(n_bootstrap);
402
403    for _ in 0..n_bootstrap {
404        let mut bootstrap_indices = Vec::new();
405        let mut oob_indices = Vec::new();
406
407        // Create stratified bootstrap sample
408        for indices in class_indices.values() {
409            let class_bootstrap = create_bootstrap_sample(indices.len(), &mut *rng);
410            let class_bootstrap_indices: Vec<usize> =
411                class_bootstrap.iter().map(|&i| indices[i]).collect();
412
413            let class_oob = create_out_of_bag_indices(&class_bootstrap, indices.len());
414            let class_oob_indices: Vec<usize> = class_oob.iter().map(|&i| indices[i]).collect();
415
416            bootstrap_indices.extend(class_bootstrap_indices);
417            oob_indices.extend(class_oob_indices);
418        }
419
420        if oob_indices.is_empty() {
421            continue;
422        }
423
424        // Extract data and evaluate
425        let x_bootstrap = x.select(Axis(0), &bootstrap_indices);
426        let y_bootstrap = y.select(Axis(0), &bootstrap_indices);
427        let x_oob = x.select(Axis(0), &oob_indices);
428        let y_oob = y.select(Axis(0), &oob_indices);
429
430        let fitted = classifier.clone().fit(&x_bootstrap, &y_bootstrap)?;
431        let predictions = fitted.predict(&x_oob)?;
432
433        let correct = predictions
434            .iter()
435            .zip(y_oob.iter())
436            .filter(|(&pred, &actual)| pred == actual)
437            .count();
438        let accuracy = correct as Float / oob_indices.len() as Float;
439        bootstrap_scores.push(accuracy);
440    }
441
442    if bootstrap_scores.is_empty() {
443        return Err(SklearsError::InvalidInput(
444            "No valid bootstrap samples created".to_string(),
445        ));
446    }
447
448    Ok(BootstrapValidationResult::new(
449        bootstrap_scores,
450        format!("{:?}", classifier.strategy),
451        0.95,
452    ))
453}
454
455/// Helper functions for parsing strategies (simplified versions)
456fn parse_classifier_strategy(strategy: &str) -> Result<ClassifierStrategy> {
457    match strategy.to_lowercase().as_str() {
458        "mostfrequent" | "most_frequent" => Ok(ClassifierStrategy::MostFrequent),
459        "stratified" => Ok(ClassifierStrategy::Stratified),
460        "uniform" => Ok(ClassifierStrategy::Uniform),
461        "constant" => Ok(ClassifierStrategy::Constant),
462        _ => Err(SklearsError::InvalidInput(format!(
463            "Unknown classifier strategy: {}",
464            strategy
465        ))),
466    }
467}
468
469fn parse_regressor_strategy(strategy: &str) -> Result<RegressorStrategy> {
470    match strategy.to_lowercase().as_str() {
471        "mean" => Ok(RegressorStrategy::Mean),
472        "median" => Ok(RegressorStrategy::Median),
473        "quantile" => Ok(RegressorStrategy::Quantile(0.5)),
474        "constant" => Ok(RegressorStrategy::Constant(0.0)),
475        _ => Err(SklearsError::InvalidInput(format!(
476            "Unknown regressor strategy: {}",
477            strategy
478        ))),
479    }
480}