// ghostflow_ml/model_selection_extended.rs

//! Extended Model Selection - RandomizedSearchCV, GroupKFold, RepeatedKFold

use ghostflow_core::Tensor;
use rand::prelude::*;
use std::collections::HashMap;
6
/// Parameter distribution for randomized search
///
/// Each variant describes how one hyperparameter is drawn; `sample` always
/// returns an `f32` so heterogeneous parameters can share a single
/// `HashMap<String, f32>`.
#[derive(Clone)]
pub enum ParamDistribution {
    /// Uniform distribution over the continuous range `[low, high)`
    Uniform { low: f32, high: f32 },
    /// Log-uniform distribution (for learning rates, regularization).
    /// NOTE(review): `sample` takes `ln()` of both bounds, so they must be
    /// strictly positive — confirm callers uphold this.
    LogUniform { low: f32, high: f32 },
    /// Discrete uniform over integers in the inclusive range `[low, high]`
    IntUniform { low: i32, high: i32 },
    /// Choice from a list of values (sampling panics on an empty list)
    Choice(Vec<f32>),
    /// Choice from a list of integers (sampling panics on an empty list)
    IntChoice(Vec<i32>),
}
21
22impl ParamDistribution {
23    pub fn sample(&self, rng: &mut impl Rng) -> f32 {
24        match self {
25            ParamDistribution::Uniform { low, high } => {
26                rng.gen::<f32>() * (high - low) + low
27            }
28            ParamDistribution::LogUniform { low, high } => {
29                let log_low = low.ln();
30                let log_high = high.ln();
31                (rng.gen::<f32>() * (log_high - log_low) + log_low).exp()
32            }
33            ParamDistribution::IntUniform { low, high } => {
34                rng.gen_range(*low..=*high) as f32
35            }
36            ParamDistribution::Choice(values) => {
37                values[rng.gen_range(0..values.len())]
38            }
39            ParamDistribution::IntChoice(values) => {
40                values[rng.gen_range(0..values.len())] as f32
41            }
42        }
43    }
44}
45
/// Result of randomized search
#[derive(Clone)]
pub struct RandomizedSearchResult {
    /// Parameter values of the best-scoring candidate
    /// (empty map when no candidate was evaluated).
    pub best_params: HashMap<String, f32>,
    /// Mean CV score of the best candidate
    /// (`f32::NEG_INFINITY` when no candidate was evaluated).
    pub best_score: f32,
    /// Per-candidate cross-validation summaries, in evaluation order.
    pub cv_results: Vec<CVResult>,
}
53
/// Cross-validation summary for a single sampled parameter combination.
#[derive(Clone)]
pub struct CVResult {
    /// The sampled parameter values, keyed by parameter name.
    pub params: HashMap<String, f32>,
    /// Mean of the per-fold scores.
    pub mean_score: f32,
    /// Population standard deviation of the per-fold scores.
    pub std_score: f32,
    /// Raw score from each fold, in fold order.
    pub scores: Vec<f32>,
}
61
/// Randomized Search Cross-Validation
///
/// Samples `n_iter` random parameter combinations from
/// `param_distributions` and evaluates each with `cv`-fold
/// cross-validation (see `search`).
pub struct RandomizedSearchCV {
    /// Distribution to sample from, keyed by parameter name.
    pub param_distributions: HashMap<String, ParamDistribution>,
    /// Number of random parameter combinations to try.
    pub n_iter: usize,
    /// Number of cross-validation folds (default 5).
    pub cv: usize,
    /// Requested scoring metric.
    /// NOTE(review): `search` never reads this — scoring is delegated to
    /// the caller-supplied `fit_and_score` closure.
    pub scoring: Scoring,
    /// Seed for reproducible parameter sampling; `None` uses OS entropy.
    pub random_state: Option<u64>,
    /// Whether to refit on the full data after the search.
    /// NOTE(review): currently unused by `search`.
    pub refit: bool,
    /// Requested parallelism. NOTE(review): currently unused; `search`
    /// runs serially.
    pub n_jobs: usize,
    // Best parameter set found so far (`None` until `search` runs).
    best_params_: Option<HashMap<String, f32>>,
    // Best mean CV score; `NEG_INFINITY` until `search` finds a candidate.
    best_score_: f32,
    // Per-candidate CV results from the most recent `search` call.
    cv_results_: Vec<CVResult>,
}
75
/// Scoring metric selector.
///
/// The `Neg*` variants are presumably negated errors so that "higher is
/// better" holds uniformly (sklearn-style) — confirm against the scorer
/// implementation, which is not in this file.
/// NOTE(review): `RandomizedSearchCV::search` never consults this value;
/// the metric is whatever the caller's closure computes.
#[derive(Clone, Copy)]
pub enum Scoring {
    Accuracy,
    F1,
    Precision,
    Recall,
    R2,
    NegMSE,
    NegMAE,
}
86
impl RandomizedSearchCV {
    /// Create a search over `param_distributions` evaluating `n_iter`
    /// randomly sampled parameter combinations.
    ///
    /// Defaults: 5-fold CV, `Scoring::Accuracy`, entropy-seeded RNG,
    /// `refit = true`, `n_jobs = 1`.
    pub fn new(param_distributions: HashMap<String, ParamDistribution>, n_iter: usize) -> Self {
        RandomizedSearchCV {
            param_distributions,
            n_iter,
            cv: 5,
            scoring: Scoring::Accuracy,
            random_state: None,
            refit: true,
            n_jobs: 1,
            best_params_: None,
            best_score_: f32::NEG_INFINITY,
            cv_results_: Vec::new(),
        }
    }

    /// Builder-style setter: number of cross-validation folds.
    pub fn cv(mut self, cv: usize) -> Self { self.cv = cv; self }
    /// Builder-style setter: scoring metric.
    /// NOTE(review): `search` never reads `self.scoring` — the metric is
    /// whatever the caller's `fit_and_score` closure returns.
    pub fn scoring(mut self, s: Scoring) -> Self { self.scoring = s; self }
    /// Builder-style setter: RNG seed for reproducible parameter sampling.
    pub fn random_state(mut self, seed: u64) -> Self { self.random_state = Some(seed); self }

    /// Run randomized search with a generic model
    /// Returns the best parameters found
    ///
    /// `fit_and_score(x_train, y_train, x_val, y_val, params)` must train a
    /// model configured by `params` and return its score on the validation
    /// pair; higher scores are treated as better (candidates are compared
    /// with `>`).
    ///
    /// Side effects: clears and repopulates `cv_results_`, `best_score_`
    /// and `best_params_` on every call.
    ///
    /// NOTE(review): assumes `x` is a 2-D tensor `[n_samples, n_features]`
    /// and `y` has one entry per sample — confirm with `Tensor` callers.
    /// Folds are contiguous, unshuffled slices; if `cv > n_samples`,
    /// `fold_size` is 0 and all but the last fold get an empty validation
    /// set — confirm callers keep `cv <= n_samples`.
    pub fn search<F>(&mut self, x: &Tensor, y: &Tensor, mut fit_and_score: F) -> RandomizedSearchResult
    where
        F: FnMut(&Tensor, &Tensor, &Tensor, &Tensor, &HashMap<String, f32>) -> f32,
    {
        let mut rng = match self.random_state {
            Some(seed) => StdRng::seed_from_u64(seed),
            None => StdRng::from_entropy(),
        };

        let n_samples = x.dims()[0];
        let n_features = x.dims()[1];
        let x_data = x.data_f32();
        let y_data = y.data_f32();

        // Discard results from any previous search.
        self.cv_results_.clear();
        self.best_score_ = f32::NEG_INFINITY;

        for _ in 0..self.n_iter {
            // Sample parameters
            let params: HashMap<String, f32> = self.param_distributions.iter()
                .map(|(name, dist)| (name.clone(), dist.sample(&mut rng)))
                .collect();

            // Cross-validation
            let fold_size = n_samples / self.cv;
            let mut scores = Vec::with_capacity(self.cv);

            for fold in 0..self.cv {
                // Validation slice for this fold; the last fold absorbs the
                // remainder when n_samples is not divisible by cv.
                let val_start = fold * fold_size;
                let val_end = if fold == self.cv - 1 { n_samples } else { (fold + 1) * fold_size };

                // Split data
                let train_indices: Vec<usize> = (0..n_samples)
                    .filter(|&i| i < val_start || i >= val_end)
                    .collect();
                let val_indices: Vec<usize> = (val_start..val_end).collect();

                // Gather the selected rows into dense row-major buffers.
                let x_train: Vec<f32> = train_indices.iter()
                    .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
                    .collect();
                let y_train: Vec<f32> = train_indices.iter().map(|&i| y_data[i]).collect();

                let x_val: Vec<f32> = val_indices.iter()
                    .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
                    .collect();
                let y_val: Vec<f32> = val_indices.iter().map(|&i| y_data[i]).collect();

                let x_train_t = Tensor::from_slice(&x_train, &[train_indices.len(), n_features]).unwrap();
                let y_train_t = Tensor::from_slice(&y_train, &[train_indices.len()]).unwrap();
                let x_val_t = Tensor::from_slice(&x_val, &[val_indices.len(), n_features]).unwrap();
                let y_val_t = Tensor::from_slice(&y_val, &[val_indices.len()]).unwrap();

                let score = fit_and_score(&x_train_t, &y_train_t, &x_val_t, &y_val_t, &params);
                scores.push(score);
            }

            // Mean and population standard deviation across folds.
            let mean_score = scores.iter().sum::<f32>() / scores.len() as f32;
            let std_score = (scores.iter().map(|&s| (s - mean_score).powi(2)).sum::<f32>()
                / scores.len() as f32).sqrt();

            let cv_result = CVResult {
                params: params.clone(),
                mean_score,
                std_score,
                scores,
            };
            self.cv_results_.push(cv_result);

            // Track the running best by mean CV score.
            if mean_score > self.best_score_ {
                self.best_score_ = mean_score;
                self.best_params_ = Some(params);
            }
        }

        RandomizedSearchResult {
            best_params: self.best_params_.clone().unwrap_or_default(),
            best_score: self.best_score_,
            cv_results: self.cv_results_.clone(),
        }
    }

    /// Best parameter set found by the last `search`, if any.
    pub fn best_params(&self) -> Option<&HashMap<String, f32>> {
        self.best_params_.as_ref()
    }

    /// Best mean CV score found (`NEG_INFINITY` before any search).
    pub fn best_score(&self) -> f32 {
        self.best_score_
    }

    /// Per-candidate CV summaries from the last `search`.
    pub fn cv_results(&self) -> &[CVResult] {
        &self.cv_results_
    }
}
202
/// Group K-Fold Cross-Validation
///
/// Produces `n_splits` folds in which all samples sharing a group label
/// land on the same side of the split, so a group never appears in both
/// the train and test indices of the same fold.
pub struct GroupKFold {
    /// Number of folds to produce.
    pub n_splits: usize,
}

impl GroupKFold {
    /// Create a splitter producing `n_splits` folds.
    pub fn new(n_splits: usize) -> Self {
        GroupKFold { n_splits }
    }

    /// Split data ensuring groups are not split across folds
    ///
    /// `groups[i]` is the group label of sample `i` and must have
    /// `n_samples` entries. Unique labels are sorted and dealt to folds in
    /// contiguous runs of `ceil(n_groups / n_splits)`; each fold's test set
    /// holds the samples of its groups and its train set holds everything
    /// else. Returns one `(train_indices, test_indices)` pair per fold,
    /// indices in ascending order.
    ///
    /// If `n_splits` exceeds the number of distinct groups, the surplus
    /// folds have an empty test set (train = all samples).
    pub fn split(&self, n_samples: usize, groups: &[usize]) -> Vec<(Vec<usize>, Vec<usize>)> {
        // Sorted, deduplicated list of group labels.
        let mut unique_groups: Vec<usize> = groups.to_vec();
        unique_groups.sort();
        unique_groups.dedup();

        let n_groups = unique_groups.len();
        // Ceiling division: up to this many distinct groups per fold.
        let groups_per_fold = (n_groups + self.n_splits - 1) / self.n_splits;

        let mut folds = Vec::with_capacity(self.n_splits);

        for fold in 0..self.n_splits {
            // BUG FIX: the start offset must be clamped to `n_groups` too.
            // Previously only the end was clamped, so whenever
            // `fold * groups_per_fold > n_groups` (n_splits much larger than
            // the number of distinct groups) the slice below panicked with a
            // "slice index starts at N but ends at M" out-of-bounds error.
            let fold_groups_start = (fold * groups_per_fold).min(n_groups);
            let fold_groups_end = ((fold + 1) * groups_per_fold).min(n_groups);
            // Small sorted run of labels owned by this fold; a linear
            // `contains` over it replaces the previous HashSet without
            // changing the output.
            let fold_groups = &unique_groups[fold_groups_start..fold_groups_end];

            let test_indices: Vec<usize> = (0..n_samples)
                .filter(|&i| fold_groups.contains(&groups[i]))
                .collect();
            let train_indices: Vec<usize> = (0..n_samples)
                .filter(|&i| !fold_groups.contains(&groups[i]))
                .collect();

            folds.push((train_indices, test_indices));
        }

        folds
    }
}
244
245/// Repeated K-Fold Cross-Validation
246pub struct RepeatedKFold {
247    pub n_splits: usize,
248    pub n_repeats: usize,
249    pub random_state: Option<u64>,
250}
251
252impl RepeatedKFold {
253    pub fn new(n_splits: usize, n_repeats: usize) -> Self {
254        RepeatedKFold {
255            n_splits,
256            n_repeats,
257            random_state: None,
258        }
259    }
260
261    pub fn random_state(mut self, seed: u64) -> Self {
262        self.random_state = Some(seed);
263        self
264    }
265
266    pub fn split(&self, n_samples: usize) -> Vec<(Vec<usize>, Vec<usize>)> {
267        let mut rng = match self.random_state {
268            Some(seed) => StdRng::seed_from_u64(seed),
269            None => StdRng::from_entropy(),
270        };
271
272        let mut all_folds = Vec::with_capacity(self.n_splits * self.n_repeats);
273
274        for _ in 0..self.n_repeats {
275            let mut indices: Vec<usize> = (0..n_samples).collect();
276            indices.shuffle(&mut rng);
277
278            let fold_size = n_samples / self.n_splits;
279
280            for fold in 0..self.n_splits {
281                let test_start = fold * fold_size;
282                let test_end = if fold == self.n_splits - 1 { n_samples } else { (fold + 1) * fold_size };
283
284                let test_indices: Vec<usize> = indices[test_start..test_end].to_vec();
285                let train_indices: Vec<usize> = indices[..test_start].iter()
286                    .chain(indices[test_end..].iter())
287                    .cloned()
288                    .collect();
289
290                all_folds.push((train_indices, test_indices));
291            }
292        }
293
294        all_folds
295    }
296
297    pub fn get_n_splits(&self) -> usize {
298        self.n_splits * self.n_repeats
299    }
300}
301
/// Stratified Shuffle Split
///
/// Produces `n_splits` independent random train/test partitions in which
/// each class (distinct value of `y`, truncated to `i32`) contributes
/// roughly `test_size` of its samples to the test side.
pub struct StratifiedShuffleSplit {
    /// Number of independent shuffled splits to generate.
    pub n_splits: usize,
    /// Fraction of each class to place in the test set.
    pub test_size: f32,
    /// Seed for reproducible splits; `None` uses OS entropy.
    pub random_state: Option<u64>,
}
308
309impl StratifiedShuffleSplit {
310    pub fn new(n_splits: usize, test_size: f32) -> Self {
311        StratifiedShuffleSplit {
312            n_splits,
313            test_size,
314            random_state: None,
315        }
316    }
317
318    pub fn random_state(mut self, seed: u64) -> Self {
319        self.random_state = Some(seed);
320        self
321    }
322
323    pub fn split(&self, y: &[f32]) -> Vec<(Vec<usize>, Vec<usize>)> {
324        let mut rng = match self.random_state {
325            Some(seed) => StdRng::seed_from_u64(seed),
326            None => StdRng::from_entropy(),
327        };
328
329        let _n_samples = y.len();
330
331        // Group indices by class
332        let mut class_indices: HashMap<i32, Vec<usize>> = HashMap::new();
333        for (i, &label) in y.iter().enumerate() {
334            class_indices.entry(label as i32).or_default().push(i);
335        }
336
337        let mut all_splits = Vec::with_capacity(self.n_splits);
338
339        for _ in 0..self.n_splits {
340            let mut train_indices = Vec::new();
341            let mut test_indices = Vec::new();
342
343            for (_, indices) in &class_indices {
344                let mut shuffled = indices.clone();
345                shuffled.shuffle(&mut rng);
346
347                let n_test = (indices.len() as f32 * self.test_size).ceil() as usize;
348                let n_test = n_test.max(1).min(indices.len() - 1);
349
350                test_indices.extend_from_slice(&shuffled[..n_test]);
351                train_indices.extend_from_slice(&shuffled[n_test..]);
352            }
353
354            train_indices.shuffle(&mut rng);
355            test_indices.shuffle(&mut rng);
356
357            all_splits.push((train_indices, test_indices));
358        }
359
360        all_splits
361    }
362}
363
/// Learning Curve - evaluate model performance with varying training set sizes
///
/// For each ratio in `train_sizes` (a fraction of the full sample count),
/// runs `cv`-fold cross-validation in which every fold trains on at most
/// that many samples, scoring each fitted model on both its own training
/// subset and the held-out validation fold.
///
/// Returns `(sizes, train_scores, test_scores)`: the absolute training
/// sizes actually used and, per size, the mean train / validation score.
///
/// `fit_and_score(x_fit, y_fit, x_eval, y_eval)` must fit on the first
/// pair and return a score on the second; it is called twice per fold
/// (once scoring on the training data, once on the validation data).
///
/// NOTE(review): assumes `x` is `[n_samples, n_features]` and `y` has one
/// entry per sample — confirm with callers. Folds are contiguous,
/// unshuffled slices; ratios yielding fewer than 2 training samples are
/// skipped entirely.
pub fn learning_curve<F>(
    x: &Tensor,
    y: &Tensor,
    train_sizes: &[f32],
    cv: usize,
    mut fit_and_score: F,
) -> (Vec<usize>, Vec<f32>, Vec<f32>)
where
    F: FnMut(&Tensor, &Tensor, &Tensor, &Tensor) -> f32,
{
    let x_data = x.data_f32();
    let y_data = y.data_f32();
    let n_samples = x.dims()[0];
    let n_features = x.dims()[1];

    let mut sizes = Vec::new();
    let mut train_scores = Vec::new();
    let mut test_scores = Vec::new();

    for &size_ratio in train_sizes {
        // Absolute training-set size for this point on the curve.
        let train_size = (n_samples as f32 * size_ratio) as usize;
        if train_size < 2 { continue; }

        let fold_size = n_samples / cv;
        let mut fold_train_scores = Vec::new();
        let mut fold_test_scores = Vec::new();

        for fold in 0..cv {
            // Validation slice; the last fold absorbs the remainder.
            let val_start = fold * fold_size;
            let val_end = if fold == cv - 1 { n_samples } else { (fold + 1) * fold_size };

            let all_train_indices: Vec<usize> = (0..n_samples)
                .filter(|&i| i < val_start || i >= val_end)
                .collect();
            let val_indices: Vec<usize> = (val_start..val_end).collect();

            // Use only train_size samples
            let train_indices: Vec<usize> = all_train_indices.into_iter()
                .take(train_size)
                .collect();

            if train_indices.is_empty() { continue; }

            // Gather the selected rows into dense row-major buffers.
            let x_train: Vec<f32> = train_indices.iter()
                .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
                .collect();
            let y_train: Vec<f32> = train_indices.iter().map(|&i| y_data[i]).collect();

            let x_val: Vec<f32> = val_indices.iter()
                .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
                .collect();
            let y_val: Vec<f32> = val_indices.iter().map(|&i| y_data[i]).collect();

            let x_train_t = Tensor::from_slice(&x_train, &[train_indices.len(), n_features]).unwrap();
            let y_train_t = Tensor::from_slice(&y_train, &[train_indices.len()]).unwrap();
            let x_val_t = Tensor::from_slice(&x_val, &[val_indices.len(), n_features]).unwrap();
            let y_val_t = Tensor::from_slice(&y_val, &[val_indices.len()]).unwrap();

            // Score on training data
            let train_score = fit_and_score(&x_train_t, &y_train_t, &x_train_t, &y_train_t);
            fold_train_scores.push(train_score);

            // Score on validation data
            let test_score = fit_and_score(&x_train_t, &y_train_t, &x_val_t, &y_val_t);
            fold_test_scores.push(test_score);
        }

        // Average over folds; skip sizes where every fold was skipped.
        // (Both score vectors are pushed together, so they are always the
        // same length.)
        if !fold_train_scores.is_empty() {
            sizes.push(train_size);
            train_scores.push(fold_train_scores.iter().sum::<f32>() / fold_train_scores.len() as f32);
            test_scores.push(fold_test_scores.iter().sum::<f32>() / fold_test_scores.len() as f32);
        }
    }

    (sizes, train_scores, test_scores)
}
441
/// Validation Curve - evaluate model performance with varying hyperparameter values
///
/// For each value in `param_values`, runs `cv`-fold cross-validation and
/// records the mean score on the training data and on the held-out fold,
/// letting the caller plot under-/over-fitting against the hyperparameter.
///
/// Returns `(train_scores, test_scores)`, one mean score per entry of
/// `param_values`, in the same order.
///
/// `fit_and_score(x_fit, y_fit, x_eval, y_eval, value)` must fit with the
/// given hyperparameter value and return a score on the eval pair; it is
/// called twice per fold (train-on-train score, then validation score).
///
/// NOTE(review): assumes `x` is `[n_samples, n_features]` and `y` has one
/// entry per sample — confirm with callers. Folds are contiguous,
/// unshuffled slices; `cv` must be at least 1 or the per-value averages
/// divide by zero.
pub fn validation_curve<F>(
    x: &Tensor,
    y: &Tensor,
    param_values: &[f32],
    cv: usize,
    mut fit_and_score: F,
) -> (Vec<f32>, Vec<f32>)
where
    F: FnMut(&Tensor, &Tensor, &Tensor, &Tensor, f32) -> f32,
{
    let x_data = x.data_f32();
    let y_data = y.data_f32();
    let n_samples = x.dims()[0];
    let n_features = x.dims()[1];

    let mut train_scores = Vec::new();
    let mut test_scores = Vec::new();

    for &param_value in param_values {
        let fold_size = n_samples / cv;
        let mut fold_train_scores = Vec::new();
        let mut fold_test_scores = Vec::new();

        for fold in 0..cv {
            // Validation slice; the last fold absorbs the remainder.
            let val_start = fold * fold_size;
            let val_end = if fold == cv - 1 { n_samples } else { (fold + 1) * fold_size };

            let train_indices: Vec<usize> = (0..n_samples)
                .filter(|&i| i < val_start || i >= val_end)
                .collect();
            let val_indices: Vec<usize> = (val_start..val_end).collect();

            // Gather the selected rows into dense row-major buffers.
            let x_train: Vec<f32> = train_indices.iter()
                .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
                .collect();
            let y_train: Vec<f32> = train_indices.iter().map(|&i| y_data[i]).collect();

            let x_val: Vec<f32> = val_indices.iter()
                .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
                .collect();
            let y_val: Vec<f32> = val_indices.iter().map(|&i| y_data[i]).collect();

            let x_train_t = Tensor::from_slice(&x_train, &[train_indices.len(), n_features]).unwrap();
            let y_train_t = Tensor::from_slice(&y_train, &[train_indices.len()]).unwrap();
            let x_val_t = Tensor::from_slice(&x_val, &[val_indices.len(), n_features]).unwrap();
            let y_val_t = Tensor::from_slice(&y_val, &[val_indices.len()]).unwrap();

            // Score on the training data itself, then on the held-out fold.
            let train_score = fit_and_score(&x_train_t, &y_train_t, &x_train_t, &y_train_t, param_value);
            fold_train_scores.push(train_score);

            let test_score = fit_and_score(&x_train_t, &y_train_t, &x_val_t, &y_val_t, param_value);
            fold_test_scores.push(test_score);
        }

        train_scores.push(fold_train_scores.iter().sum::<f32>() / fold_train_scores.len() as f32);
        test_scores.push(fold_test_scores.iter().sum::<f32>() / fold_test_scores.len() as f32);
    }

    (train_scores, test_scores)
}
503
504#[cfg(test)]
505mod tests {
506    use super::*;
507
508    #[test]
509    fn test_group_kfold() {
510        let groups = vec![0, 0, 1, 1, 2, 2, 3, 3];
511        let gkf = GroupKFold::new(2);
512        let splits = gkf.split(8, &groups);
513        
514        assert_eq!(splits.len(), 2);
515        for (train, test) in &splits {
516            assert!(!train.is_empty());
517            assert!(!test.is_empty());
518        }
519    }
520
521    #[test]
522    fn test_repeated_kfold() {
523        let rkf = RepeatedKFold::new(3, 2).random_state(42);
524        let splits = rkf.split(9);
525        
526        assert_eq!(splits.len(), 6); // 3 folds * 2 repeats
527    }
528
529    #[test]
530    fn test_stratified_shuffle_split() {
531        let y = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
532        let sss = StratifiedShuffleSplit::new(3, 0.33).random_state(42);
533        let splits = sss.split(&y);
534        
535        assert_eq!(splits.len(), 3);
536    }
537}