1use ghostflow_core::Tensor;
4use rand::prelude::*;
5use std::collections::HashMap;
6
/// A sampling distribution for a single hyper-parameter.
#[derive(Clone)]
pub enum ParamDistribution {
    /// Uniform on `[low, high)`.
    Uniform { low: f32, high: f32 },
    /// Uniform in ln-space on `[low, high)`; assumes `low > 0` (ln of a
    /// non-positive value is NaN) — TODO confirm callers guarantee this.
    LogUniform { low: f32, high: f32 },
    /// Uniform over the integers `low..=high` (inclusive), returned as `f32`.
    IntUniform { low: i32, high: i32 },
    /// Uniform choice from a fixed list of values.
    Choice(Vec<f32>),
    /// Uniform choice from a fixed list of integers, returned as `f32`.
    IntChoice(Vec<i32>),
}
21
22impl ParamDistribution {
23 pub fn sample(&self, rng: &mut impl Rng) -> f32 {
24 match self {
25 ParamDistribution::Uniform { low, high } => {
26 rng.gen::<f32>() * (high - low) + low
27 }
28 ParamDistribution::LogUniform { low, high } => {
29 let log_low = low.ln();
30 let log_high = high.ln();
31 (rng.gen::<f32>() * (log_high - log_low) + log_low).exp()
32 }
33 ParamDistribution::IntUniform { low, high } => {
34 rng.gen_range(*low..=*high) as f32
35 }
36 ParamDistribution::Choice(values) => {
37 values[rng.gen_range(0..values.len())]
38 }
39 ParamDistribution::IntChoice(values) => {
40 values[rng.gen_range(0..values.len())] as f32
41 }
42 }
43 }
44}
45
/// Outcome of a `RandomizedSearchCV::search` run.
#[derive(Clone)]
pub struct RandomizedSearchResult {
    /// Parameter setting with the highest mean CV score (empty map if no
    /// candidates were evaluated).
    pub best_params: HashMap<String, f32>,
    /// Mean CV score of `best_params` (`NEG_INFINITY` if none evaluated).
    pub best_score: f32,
    /// Per-candidate cross-validation results, in evaluation order.
    pub cv_results: Vec<CVResult>,
}
53
/// Cross-validation outcome for one sampled parameter setting.
#[derive(Clone)]
pub struct CVResult {
    /// The sampled parameter values, keyed by parameter name.
    pub params: HashMap<String, f32>,
    /// Mean of `scores`.
    pub mean_score: f32,
    /// Population standard deviation of `scores`.
    pub std_score: f32,
    /// One score per CV fold.
    pub scores: Vec<f32>,
}
61
/// Randomized hyper-parameter search with k-fold cross-validation.
///
/// Draws `n_iter` parameter settings from `param_distributions`, evaluates
/// each with `cv`-fold CV via a caller-supplied closure, and tracks the best
/// mean score seen.
pub struct RandomizedSearchCV {
    /// Distribution to sample from, keyed by parameter name.
    pub param_distributions: HashMap<String, ParamDistribution>,
    /// Number of parameter settings to sample.
    pub n_iter: usize,
    /// Number of cross-validation folds.
    pub cv: usize,
    /// Requested scoring metric. NOTE(review): `search` never reads this —
    /// the caller's closure computes the score; confirm whether it should.
    pub scoring: Scoring,
    /// Optional RNG seed for reproducible parameter sampling.
    pub random_state: Option<u64>,
    /// NOTE(review): stored but currently unused by `search`.
    pub refit: bool,
    /// NOTE(review): stored but currently unused; `search` runs single-threaded.
    pub n_jobs: usize,
    // Best parameter set found so far (None until `search` has run).
    best_params_: Option<HashMap<String, f32>>,
    // Best mean CV score (NEG_INFINITY until `search` has run).
    best_score_: f32,
    // Per-candidate CV results from the most recent `search`.
    cv_results_: Vec<CVResult>,
}
75
/// Scoring metric selector. The `Neg*` variants presumably denote negated
/// losses so "higher is better" holds uniformly — TODO confirm; no code in
/// this file dispatches on these variants yet.
#[derive(Clone, Copy)]
pub enum Scoring {
    Accuracy,
    F1,
    Precision,
    Recall,
    R2,
    NegMSE,
    NegMAE,
}
86
87impl RandomizedSearchCV {
88 pub fn new(param_distributions: HashMap<String, ParamDistribution>, n_iter: usize) -> Self {
89 RandomizedSearchCV {
90 param_distributions,
91 n_iter,
92 cv: 5,
93 scoring: Scoring::Accuracy,
94 random_state: None,
95 refit: true,
96 n_jobs: 1,
97 best_params_: None,
98 best_score_: f32::NEG_INFINITY,
99 cv_results_: Vec::new(),
100 }
101 }
102
103 pub fn cv(mut self, cv: usize) -> Self { self.cv = cv; self }
104 pub fn scoring(mut self, s: Scoring) -> Self { self.scoring = s; self }
105 pub fn random_state(mut self, seed: u64) -> Self { self.random_state = Some(seed); self }
106
107 pub fn search<F>(&mut self, x: &Tensor, y: &Tensor, mut fit_and_score: F) -> RandomizedSearchResult
110 where
111 F: FnMut(&Tensor, &Tensor, &Tensor, &Tensor, &HashMap<String, f32>) -> f32,
112 {
113 let mut rng = match self.random_state {
114 Some(seed) => StdRng::seed_from_u64(seed),
115 None => StdRng::from_entropy(),
116 };
117
118 let n_samples = x.dims()[0];
119 let n_features = x.dims()[1];
120 let x_data = x.data_f32();
121 let y_data = y.data_f32();
122
123 self.cv_results_.clear();
124 self.best_score_ = f32::NEG_INFINITY;
125
126 for _ in 0..self.n_iter {
127 let params: HashMap<String, f32> = self.param_distributions.iter()
129 .map(|(name, dist)| (name.clone(), dist.sample(&mut rng)))
130 .collect();
131
132 let fold_size = n_samples / self.cv;
134 let mut scores = Vec::with_capacity(self.cv);
135
136 for fold in 0..self.cv {
137 let val_start = fold * fold_size;
138 let val_end = if fold == self.cv - 1 { n_samples } else { (fold + 1) * fold_size };
139
140 let train_indices: Vec<usize> = (0..n_samples)
142 .filter(|&i| i < val_start || i >= val_end)
143 .collect();
144 let val_indices: Vec<usize> = (val_start..val_end).collect();
145
146 let x_train: Vec<f32> = train_indices.iter()
147 .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
148 .collect();
149 let y_train: Vec<f32> = train_indices.iter().map(|&i| y_data[i]).collect();
150
151 let x_val: Vec<f32> = val_indices.iter()
152 .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
153 .collect();
154 let y_val: Vec<f32> = val_indices.iter().map(|&i| y_data[i]).collect();
155
156 let x_train_t = Tensor::from_slice(&x_train, &[train_indices.len(), n_features]).unwrap();
157 let y_train_t = Tensor::from_slice(&y_train, &[train_indices.len()]).unwrap();
158 let x_val_t = Tensor::from_slice(&x_val, &[val_indices.len(), n_features]).unwrap();
159 let y_val_t = Tensor::from_slice(&y_val, &[val_indices.len()]).unwrap();
160
161 let score = fit_and_score(&x_train_t, &y_train_t, &x_val_t, &y_val_t, ¶ms);
162 scores.push(score);
163 }
164
165 let mean_score = scores.iter().sum::<f32>() / scores.len() as f32;
166 let std_score = (scores.iter().map(|&s| (s - mean_score).powi(2)).sum::<f32>()
167 / scores.len() as f32).sqrt();
168
169 let cv_result = CVResult {
170 params: params.clone(),
171 mean_score,
172 std_score,
173 scores,
174 };
175 self.cv_results_.push(cv_result);
176
177 if mean_score > self.best_score_ {
178 self.best_score_ = mean_score;
179 self.best_params_ = Some(params);
180 }
181 }
182
183 RandomizedSearchResult {
184 best_params: self.best_params_.clone().unwrap_or_default(),
185 best_score: self.best_score_,
186 cv_results: self.cv_results_.clone(),
187 }
188 }
189
190 pub fn best_params(&self) -> Option<&HashMap<String, f32>> {
191 self.best_params_.as_ref()
192 }
193
194 pub fn best_score(&self) -> f32 {
195 self.best_score_
196 }
197
198 pub fn cv_results(&self) -> &[CVResult] {
199 &self.cv_results_
200 }
201}
202
/// K-fold splitter that keeps all samples sharing a group label in the same
/// fold, so no group ever straddles the train/test boundary.
pub struct GroupKFold {
    /// Number of folds to generate.
    pub n_splits: usize,
}

impl GroupKFold {
    /// Creates a splitter producing `n_splits` folds.
    pub fn new(n_splits: usize) -> Self {
        GroupKFold { n_splits }
    }

    /// Returns `n_splits` `(train_indices, test_indices)` pairs over
    /// `0..n_samples`, where `groups[i]` is sample `i`'s group label.
    ///
    /// Groups are sorted and deduplicated, then assigned to folds in
    /// contiguous runs of `ceil(n_groups / n_splits)`; every sample whose
    /// group falls in a fold's run is a test sample for that fold. If
    /// `n_splits > n_groups`, trailing folds have empty test sets.
    pub fn split(&self, n_samples: usize, groups: &[usize]) -> Vec<(Vec<usize>, Vec<usize>)> {
        let mut distinct: Vec<usize> = groups.to_vec();
        distinct.sort();
        distinct.dedup();

        let group_count = distinct.len();
        // Ceiling division so every group lands in exactly one fold.
        let per_fold = (group_count + self.n_splits - 1) / self.n_splits;

        (0..self.n_splits)
            .map(|fold| {
                let start = fold * per_fold;
                let end = ((fold + 1) * per_fold).min(group_count);
                let held_out: std::collections::HashSet<usize> =
                    distinct[start..end].iter().copied().collect();

                // partition preserves index order within each side.
                (0..n_samples).partition(|&i| !held_out.contains(&groups[i]))
            })
            .collect()
    }
}
244
245pub struct RepeatedKFold {
247 pub n_splits: usize,
248 pub n_repeats: usize,
249 pub random_state: Option<u64>,
250}
251
252impl RepeatedKFold {
253 pub fn new(n_splits: usize, n_repeats: usize) -> Self {
254 RepeatedKFold {
255 n_splits,
256 n_repeats,
257 random_state: None,
258 }
259 }
260
261 pub fn random_state(mut self, seed: u64) -> Self {
262 self.random_state = Some(seed);
263 self
264 }
265
266 pub fn split(&self, n_samples: usize) -> Vec<(Vec<usize>, Vec<usize>)> {
267 let mut rng = match self.random_state {
268 Some(seed) => StdRng::seed_from_u64(seed),
269 None => StdRng::from_entropy(),
270 };
271
272 let mut all_folds = Vec::with_capacity(self.n_splits * self.n_repeats);
273
274 for _ in 0..self.n_repeats {
275 let mut indices: Vec<usize> = (0..n_samples).collect();
276 indices.shuffle(&mut rng);
277
278 let fold_size = n_samples / self.n_splits;
279
280 for fold in 0..self.n_splits {
281 let test_start = fold * fold_size;
282 let test_end = if fold == self.n_splits - 1 { n_samples } else { (fold + 1) * fold_size };
283
284 let test_indices: Vec<usize> = indices[test_start..test_end].to_vec();
285 let train_indices: Vec<usize> = indices[..test_start].iter()
286 .chain(indices[test_end..].iter())
287 .cloned()
288 .collect();
289
290 all_folds.push((train_indices, test_indices));
291 }
292 }
293
294 all_folds
295 }
296
297 pub fn get_n_splits(&self) -> usize {
298 self.n_splits * self.n_repeats
299 }
300}
301
302pub struct StratifiedShuffleSplit {
304 pub n_splits: usize,
305 pub test_size: f32,
306 pub random_state: Option<u64>,
307}
308
309impl StratifiedShuffleSplit {
310 pub fn new(n_splits: usize, test_size: f32) -> Self {
311 StratifiedShuffleSplit {
312 n_splits,
313 test_size,
314 random_state: None,
315 }
316 }
317
318 pub fn random_state(mut self, seed: u64) -> Self {
319 self.random_state = Some(seed);
320 self
321 }
322
323 pub fn split(&self, y: &[f32]) -> Vec<(Vec<usize>, Vec<usize>)> {
324 let mut rng = match self.random_state {
325 Some(seed) => StdRng::seed_from_u64(seed),
326 None => StdRng::from_entropy(),
327 };
328
329 let _n_samples = y.len();
330
331 let mut class_indices: HashMap<i32, Vec<usize>> = HashMap::new();
333 for (i, &label) in y.iter().enumerate() {
334 class_indices.entry(label as i32).or_default().push(i);
335 }
336
337 let mut all_splits = Vec::with_capacity(self.n_splits);
338
339 for _ in 0..self.n_splits {
340 let mut train_indices = Vec::new();
341 let mut test_indices = Vec::new();
342
343 for (_, indices) in &class_indices {
344 let mut shuffled = indices.clone();
345 shuffled.shuffle(&mut rng);
346
347 let n_test = (indices.len() as f32 * self.test_size).ceil() as usize;
348 let n_test = n_test.max(1).min(indices.len() - 1);
349
350 test_indices.extend_from_slice(&shuffled[..n_test]);
351 train_indices.extend_from_slice(&shuffled[n_test..]);
352 }
353
354 train_indices.shuffle(&mut rng);
355 test_indices.shuffle(&mut rng);
356
357 all_splits.push((train_indices, test_indices));
358 }
359
360 all_splits
361 }
362}
363
/// Computes a learning curve: evaluates train/validation scores while
/// training on increasing fractions of the data.
///
/// * `x` — features, shape `[n_samples, n_features]`, row-major.
/// * `y` — targets, length `n_samples`.
/// * `train_sizes` — fractions of `n_samples` to train on (e.g. `[0.1, 0.5, 1.0]`).
/// * `cv` — number of cross-validation folds.
/// * `fit_and_score(x_train, y_train, x_eval, y_eval)` — trains on the first
///   pair and returns a score on the second. It is invoked twice per fold:
///   once scored on the training data itself, once on the held-out fold.
///
/// Returns `(sizes, train_scores, test_scores)`: the absolute sample counts
/// actually used, and the fold-averaged train/validation scores for each
/// size that produced at least one fold. Sizes under 2 samples are skipped.
pub fn learning_curve<F>(
    x: &Tensor,
    y: &Tensor,
    train_sizes: &[f32],
    cv: usize,
    mut fit_and_score: F,
) -> (Vec<usize>, Vec<f32>, Vec<f32>)
where
    F: FnMut(&Tensor, &Tensor, &Tensor, &Tensor) -> f32,
{
    let x_data = x.data_f32();
    let y_data = y.data_f32();
    let n_samples = x.dims()[0];
    let n_features = x.dims()[1];

    let mut sizes = Vec::new();
    let mut train_scores = Vec::new();
    let mut test_scores = Vec::new();

    for &size_ratio in train_sizes {
        let train_size = (n_samples as f32 * size_ratio) as usize;
        // Too few samples to fit anything meaningful — skip this size.
        if train_size < 2 { continue; }

        let fold_size = n_samples / cv;
        let mut fold_train_scores = Vec::new();
        let mut fold_test_scores = Vec::new();

        for fold in 0..cv {
            let val_start = fold * fold_size;
            // Last fold absorbs the remainder samples.
            let val_end = if fold == cv - 1 { n_samples } else { (fold + 1) * fold_size };

            let all_train_indices: Vec<usize> = (0..n_samples)
                .filter(|&i| i < val_start || i >= val_end)
                .collect();
            let val_indices: Vec<usize> = (val_start..val_end).collect();

            // Truncate to the requested size. NOTE(review): this keeps the
            // lowest-numbered indices, so the subset is deterministic rather
            // than randomly drawn — confirm that is intended.
            let train_indices: Vec<usize> = all_train_indices.into_iter()
                .take(train_size)
                .collect();

            if train_indices.is_empty() { continue; }

            // Gather the row-major feature rows for each index set.
            let x_train: Vec<f32> = train_indices.iter()
                .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
                .collect();
            let y_train: Vec<f32> = train_indices.iter().map(|&i| y_data[i]).collect();

            let x_val: Vec<f32> = val_indices.iter()
                .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
                .collect();
            let y_val: Vec<f32> = val_indices.iter().map(|&i| y_data[i]).collect();

            let x_train_t = Tensor::from_slice(&x_train, &[train_indices.len(), n_features]).unwrap();
            let y_train_t = Tensor::from_slice(&y_train, &[train_indices.len()]).unwrap();
            let x_val_t = Tensor::from_slice(&x_val, &[val_indices.len(), n_features]).unwrap();
            let y_val_t = Tensor::from_slice(&y_val, &[val_indices.len()]).unwrap();

            // Score on the training data itself (the closure trains a model
            // internally on each call)...
            let train_score = fit_and_score(&x_train_t, &y_train_t, &x_train_t, &y_train_t);
            fold_train_scores.push(train_score);

            // ...then train again and score on the held-out fold.
            let test_score = fit_and_score(&x_train_t, &y_train_t, &x_val_t, &y_val_t);
            fold_test_scores.push(test_score);
        }

        if !fold_train_scores.is_empty() {
            sizes.push(train_size);
            train_scores.push(fold_train_scores.iter().sum::<f32>() / fold_train_scores.len() as f32);
            test_scores.push(fold_test_scores.iter().sum::<f32>() / fold_test_scores.len() as f32);
        }
    }

    (sizes, train_scores, test_scores)
}
441
442pub fn validation_curve<F>(
444 x: &Tensor,
445 y: &Tensor,
446 param_values: &[f32],
447 cv: usize,
448 mut fit_and_score: F,
449) -> (Vec<f32>, Vec<f32>)
450where
451 F: FnMut(&Tensor, &Tensor, &Tensor, &Tensor, f32) -> f32,
452{
453 let x_data = x.data_f32();
454 let y_data = y.data_f32();
455 let n_samples = x.dims()[0];
456 let n_features = x.dims()[1];
457
458 let mut train_scores = Vec::new();
459 let mut test_scores = Vec::new();
460
461 for ¶m_value in param_values {
462 let fold_size = n_samples / cv;
463 let mut fold_train_scores = Vec::new();
464 let mut fold_test_scores = Vec::new();
465
466 for fold in 0..cv {
467 let val_start = fold * fold_size;
468 let val_end = if fold == cv - 1 { n_samples } else { (fold + 1) * fold_size };
469
470 let train_indices: Vec<usize> = (0..n_samples)
471 .filter(|&i| i < val_start || i >= val_end)
472 .collect();
473 let val_indices: Vec<usize> = (val_start..val_end).collect();
474
475 let x_train: Vec<f32> = train_indices.iter()
476 .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
477 .collect();
478 let y_train: Vec<f32> = train_indices.iter().map(|&i| y_data[i]).collect();
479
480 let x_val: Vec<f32> = val_indices.iter()
481 .flat_map(|&i| x_data[i * n_features..(i + 1) * n_features].to_vec())
482 .collect();
483 let y_val: Vec<f32> = val_indices.iter().map(|&i| y_data[i]).collect();
484
485 let x_train_t = Tensor::from_slice(&x_train, &[train_indices.len(), n_features]).unwrap();
486 let y_train_t = Tensor::from_slice(&y_train, &[train_indices.len()]).unwrap();
487 let x_val_t = Tensor::from_slice(&x_val, &[val_indices.len(), n_features]).unwrap();
488 let y_val_t = Tensor::from_slice(&y_val, &[val_indices.len()]).unwrap();
489
490 let train_score = fit_and_score(&x_train_t, &y_train_t, &x_train_t, &y_train_t, param_value);
491 fold_train_scores.push(train_score);
492
493 let test_score = fit_and_score(&x_train_t, &y_train_t, &x_val_t, &y_val_t, param_value);
494 fold_test_scores.push(test_score);
495 }
496
497 train_scores.push(fold_train_scores.iter().sum::<f32>() / fold_train_scores.len() as f32);
498 test_scores.push(fold_test_scores.iter().sum::<f32>() / fold_test_scores.len() as f32);
499 }
500
501 (train_scores, test_scores)
502}
503
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_group_kfold() {
        // Four groups of two samples split into two folds: each fold must
        // have a non-empty train set and a non-empty test set.
        let groups = vec![0, 0, 1, 1, 2, 2, 3, 3];
        let splits = GroupKFold::new(2).split(8, &groups);

        assert_eq!(splits.len(), 2);
        assert!(splits
            .iter()
            .all(|(train, test)| !train.is_empty() && !test.is_empty()));
    }

    #[test]
    fn test_repeated_kfold() {
        // 3 folds x 2 repeats = 6 splits in total.
        let splits = RepeatedKFold::new(3, 2).random_state(42).split(9);
        assert_eq!(splits.len(), 6);
    }

    #[test]
    fn test_stratified_shuffle_split() {
        // Two balanced classes; all 3 requested splits are produced.
        let y = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
        let splits = StratifiedShuffleSplit::new(3, 0.33).random_state(42).split(&y);
        assert_eq!(splits.len(), 3);
    }
}
538
539