1use scirs2_core::ndarray::{s, Array1, Array2};
14use scirs2_core::random::{Rng, SeedableRng};
15use sklears_core::{error::SklearsError, traits::Estimator, traits::Fit, traits::Predict};
16use std::collections::HashMap;
17
/// Baseline prediction strategies used to benchmark real estimators.
#[derive(Debug, Clone)]
pub enum BenchmarkStrategy {
    /// Predict the most frequent class (classification) or the target mean (regression).
    ZeroRule,
    /// Single best one-feature/one-threshold rule learned from the training data.
    OneRule,
    /// Majority vote over `n_stumps` randomly generated decision stumps.
    RandomStumps { n_stumps: usize },
    /// Most frequent class (identical to `ZeroRule` at predict time in this file).
    MajorityClassTieBreak,
    /// Random class draws weighted by training class frequencies.
    WeightedRandom,
    /// Least-squares line fit over the sample index (regression).
    LinearTrend,
    /// Trailing moving average parameterized by `window_size` (regression).
    MovingAverage { window_size: usize },
    /// Predict the label/value of the closest training sample (1-NN, Euclidean).
    NearestNeighbor,
    /// Competition floor baseline (majority class / mean target).
    CompetitionBaseline,
}
40
/// Domain-specific baseline strategies.
///
/// NOTE(review): only `DomainBenchmarkClassifier` stores these in this file;
/// no fit/predict logic keyed on these variants is visible here. Variant
/// descriptions below are inferred from names — confirm against the
/// implementing module.
#[derive(Debug, Clone)]
pub enum DomainStrategy {
    /// Pixel-intensity baseline (presumably image data).
    PixelIntensity,
    /// Bag-of-words baseline (presumably text data).
    BagOfWords,
    /// Seasonal decomposition with the given `period` (presumably time series).
    SeasonalDecomposition { period: usize },
    /// Popularity baseline (presumably recommendation).
    PopularityBaseline,
    /// Isolation threshold with expected `contamination` fraction (presumably anomaly detection).
    IsolationThreshold { contamination: f64 },
}
55
/// Kinds of theoretical performance bounds.
///
/// NOTE(review): no computation dispatches on these variants in this file;
/// see `TheoreticalBounds` for the concrete bound helpers implemented here.
#[derive(Debug, Clone)]
pub enum TheoreticalBound {
    /// Bayes-optimal error level.
    BayesError,
    /// Random-chance performance level.
    RandomChance,
    /// Information-theoretic bound.
    InformationBound,
    /// Statistical (sample-based) bound.
    StatisticalBound,
}
68
/// Unfitted benchmark classifier configured with a [`BenchmarkStrategy`].
#[derive(Debug, Clone)]
pub struct BenchmarkClassifier {
    /// Strategy that decides how predictions are produced after fitting.
    strategy: BenchmarkStrategy,
    /// Optional RNG seed for randomized strategies; `None` falls back to seed 0.
    random_state: Option<u64>,
}
75
/// Fitted state produced by `BenchmarkClassifier::fit`.
#[derive(Debug, Clone)]
pub struct TrainedBenchmarkClassifier {
    /// Strategy copied from the unfitted estimator.
    strategy: BenchmarkStrategy,
    /// Distinct class labels seen during training, sorted ascending.
    classes: Vec<i32>,
    /// Number of training samples per class label.
    class_counts: HashMap<i32, usize>,
    /// `(feature index, threshold, prediction-below-threshold)` rules,
    /// populated only for `OneRule` and `RandomStumps`.
    feature_rules: Option<Vec<(usize, f64, i32)>>,
    /// Full training set, retained only for `NearestNeighbor`.
    training_data: Option<(Array2<f64>, Array1<i32>)>,
    /// Seed carried over for randomized prediction strategies.
    random_state: Option<u64>,
}
86
87impl BenchmarkClassifier {
88 pub fn new(strategy: BenchmarkStrategy) -> Self {
90 Self {
91 strategy,
92 random_state: None,
93 }
94 }
95
96 pub fn with_random_state(mut self, seed: u64) -> Self {
98 self.random_state = Some(seed);
99 self
100 }
101}
102
/// `Estimator` wiring: the strategy itself doubles as the estimator configuration.
impl Estimator for BenchmarkClassifier {
    type Config = BenchmarkStrategy;
    type Error = SklearsError;
    type Float = f64;

    /// Return the configured strategy.
    fn config(&self) -> &Self::Config {
        &self.strategy
    }
}
112
113impl Fit<Array2<f64>, Array1<i32>> for BenchmarkClassifier {
114 type Fitted = TrainedBenchmarkClassifier;
115
116 fn fit(self, x: &Array2<f64>, y: &Array1<i32>) -> Result<Self::Fitted, SklearsError> {
117 let mut class_counts = HashMap::new();
118 for &class in y.iter() {
119 *class_counts.entry(class).or_insert(0) += 1;
120 }
121
122 let mut classes: Vec<_> = class_counts.keys().cloned().collect();
123 classes.sort();
124
125 let feature_rules = match &self.strategy {
126 BenchmarkStrategy::OneRule => Some(Self::build_one_rule(x, y)?),
127 BenchmarkStrategy::RandomStumps { n_stumps } => Some(Self::build_random_stumps(
128 x,
129 y,
130 *n_stumps,
131 self.random_state,
132 )?),
133 _ => None,
134 };
135
136 let training_data = match &self.strategy {
137 BenchmarkStrategy::NearestNeighbor => Some((x.clone(), y.clone())),
138 _ => None,
139 };
140
141 Ok(TrainedBenchmarkClassifier {
142 strategy: self.strategy,
143 classes,
144 class_counts,
145 feature_rules,
146 training_data,
147 random_state: self.random_state,
148 })
149 }
150}
151
152impl BenchmarkClassifier {
153 fn build_one_rule(
154 x: &Array2<f64>,
155 y: &Array1<i32>,
156 ) -> Result<Vec<(usize, f64, i32)>, SklearsError> {
157 let n_features = x.ncols();
158 let mut best_accuracy = 0.0;
159 let mut best_rule = None;
160
161 for feature_idx in 0..n_features {
162 let feature_values = x.column(feature_idx);
163
164 let mut values: Vec<_> = feature_values.iter().cloned().collect();
166 values.sort_by(|a, b| a.partial_cmp(b).unwrap());
167
168 for i in 0..values.len() - 1 {
169 let threshold = (values[i] + values[i + 1]) / 2.0;
170
171 for &(pred_below, pred_above) in &[(0, 1), (1, 0)] {
173 let mut correct = 0;
174 for (j, &actual) in y.iter().enumerate() {
175 let predicted = if feature_values[j] <= threshold {
176 pred_below
177 } else {
178 pred_above
179 };
180 if predicted == actual {
181 correct += 1;
182 }
183 }
184
185 let accuracy = correct as f64 / y.len() as f64;
186 if accuracy > best_accuracy {
187 best_accuracy = accuracy;
188 best_rule = Some((feature_idx, threshold, pred_below));
189 }
190 }
191 }
192 }
193
194 Ok(vec![best_rule.unwrap_or((0, 0.0, 0))])
195 }
196
197 fn build_random_stumps(
198 x: &Array2<f64>,
199 y: &Array1<i32>,
200 n_stumps: usize,
201 random_state: Option<u64>,
202 ) -> Result<Vec<(usize, f64, i32)>, SklearsError> {
203 let mut rng = if let Some(seed) = random_state {
204 scirs2_core::random::rngs::StdRng::seed_from_u64(seed)
205 } else {
206 scirs2_core::random::rngs::StdRng::seed_from_u64(0)
207 };
208
209 let n_features = x.ncols();
210 let mut stumps = Vec::new();
211
212 for _ in 0..n_stumps {
213 let feature_idx = rng.gen_range(0..n_features);
214 let feature_values = x.column(feature_idx);
215
216 let min_val = feature_values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
217 let max_val = feature_values
218 .iter()
219 .fold(f64::NEG_INFINITY, |a, &b| a.max(b));
220
221 let threshold = rng.gen_range(min_val..max_val + 1.0);
222 let prediction = rng.gen_range(0..2);
223
224 stumps.push((feature_idx, threshold, prediction));
225 }
226
227 Ok(stumps)
228 }
229}
230
231impl Predict<Array2<f64>, Array1<i32>> for TrainedBenchmarkClassifier {
232 fn predict(&self, x: &Array2<f64>) -> Result<Array1<i32>, SklearsError> {
233 let n_samples = x.nrows();
234 let mut predictions = Array1::zeros(n_samples);
235
236 match &self.strategy {
237 BenchmarkStrategy::ZeroRule => {
238 let most_common = self
240 .class_counts
241 .iter()
242 .max_by_key(|(_, &count)| count)
243 .map(|(&class, _)| class)
244 .unwrap_or(0);
245 predictions.fill(most_common);
246 }
247
248 BenchmarkStrategy::MajorityClassTieBreak => {
249 let most_common = self
250 .class_counts
251 .iter()
252 .max_by_key(|(_, &count)| count)
253 .map(|(&class, _)| class)
254 .unwrap_or(0);
255 predictions.fill(most_common);
256 }
257
258 BenchmarkStrategy::WeightedRandom => {
259 let mut rng = if let Some(seed) = self.random_state {
260 scirs2_core::random::rngs::StdRng::seed_from_u64(seed)
261 } else {
262 scirs2_core::random::rngs::StdRng::seed_from_u64(0)
263 };
264
265 let total_count: usize = self.class_counts.values().sum();
266 for i in 0..n_samples {
267 let rand_val = rng.gen_range(0..total_count);
268 let mut cumsum = 0;
269 for (&class, &count) in &self.class_counts {
270 cumsum += count;
271 if rand_val < cumsum {
272 predictions[i] = class;
273 break;
274 }
275 }
276 }
277 }
278
279 BenchmarkStrategy::OneRule => {
280 if let Some(rules) = &self.feature_rules {
281 if let Some((feature_idx, threshold, prediction)) = rules.first() {
282 for i in 0..n_samples {
283 predictions[i] = if x[[i, *feature_idx]] <= *threshold {
284 *prediction
285 } else {
286 1 - *prediction
287 };
288 }
289 }
290 }
291 }
292
293 BenchmarkStrategy::RandomStumps { .. } => {
294 if let Some(rules) = &self.feature_rules {
295 for i in 0..n_samples {
296 let mut votes = HashMap::new();
297 for (feature_idx, threshold, prediction) in rules {
298 let vote = if x[[i, *feature_idx]] <= *threshold {
299 *prediction
300 } else {
301 1 - *prediction
302 };
303 *votes.entry(vote).or_insert(0) += 1;
304 }
305 predictions[i] = votes
306 .into_iter()
307 .max_by_key(|(_, count)| *count)
308 .map(|(class, _)| class)
309 .unwrap_or(0);
310 }
311 }
312 }
313
314 BenchmarkStrategy::NearestNeighbor => {
315 if let Some((train_x, train_y)) = &self.training_data {
316 for i in 0..n_samples {
317 let test_point = x.row(i);
318 let mut min_distance = f64::INFINITY;
319 let mut nearest_class = 0;
320
321 for j in 0..train_x.nrows() {
322 let train_point = train_x.row(j);
323 let distance: f64 = test_point
324 .iter()
325 .zip(train_point.iter())
326 .map(|(a, b)| (a - b).powi(2))
327 .sum::<f64>()
328 .sqrt();
329
330 if distance < min_distance {
331 min_distance = distance;
332 nearest_class = train_y[j];
333 }
334 }
335 predictions[i] = nearest_class;
336 }
337 }
338 }
339
340 BenchmarkStrategy::CompetitionBaseline => {
341 let zr_pred = self
343 .class_counts
344 .iter()
345 .max_by_key(|(_, &count)| count)
346 .map(|(&class, _)| class)
347 .unwrap_or(0);
348 predictions.fill(zr_pred);
349 }
350
351 _ => {
352 let most_common = self
354 .class_counts
355 .iter()
356 .max_by_key(|(_, &count)| count)
357 .map(|(&class, _)| class)
358 .unwrap_or(0);
359 predictions.fill(most_common);
360 }
361 }
362
363 Ok(predictions)
364 }
365}
366
/// Unfitted benchmark regressor configured with a [`BenchmarkStrategy`].
#[derive(Debug, Clone)]
pub struct BenchmarkRegressor {
    /// Strategy that decides how predictions are produced after fitting.
    strategy: BenchmarkStrategy,
    /// Optional RNG seed carried into the fitted model (no regression strategy
    /// visible in this file actually draws random numbers).
    random_state: Option<u64>,
}
373
/// Fitted state produced by `BenchmarkRegressor::fit`.
#[derive(Debug, Clone)]
pub struct TrainedBenchmarkRegressor {
    /// Strategy copied from the unfitted estimator.
    strategy: BenchmarkStrategy,
    /// Mean of the training targets (ZeroRule / fallback prediction).
    mean_value: f64,
    /// Median of the training targets (computed at fit time; unused by the
    /// `predict` paths visible in this file).
    median_value: f64,
    /// Full training set, retained only for `NearestNeighbor`.
    training_data: Option<(Array2<f64>, Array1<f64>)>,
    /// `(slope, intercept)` of the index-vs-target least-squares fit (`LinearTrend`).
    trend_coefficients: Option<(f64, f64)>,
    /// Trailing moving averages over the training targets (`MovingAverage`).
    moving_avg_values: Option<Array1<f64>>,
    /// RNG seed carried over from the unfitted estimator.
    random_state: Option<u64>,
}
385
386impl BenchmarkRegressor {
387 pub fn new(strategy: BenchmarkStrategy) -> Self {
389 Self {
390 strategy,
391 random_state: None,
392 }
393 }
394
395 pub fn with_random_state(mut self, seed: u64) -> Self {
397 self.random_state = Some(seed);
398 self
399 }
400}
401
/// `Estimator` wiring: the strategy itself doubles as the estimator configuration.
impl Estimator for BenchmarkRegressor {
    type Config = BenchmarkStrategy;
    type Error = SklearsError;
    type Float = f64;

    /// Return the configured strategy.
    fn config(&self) -> &Self::Config {
        &self.strategy
    }
}
411
412impl Fit<Array2<f64>, Array1<f64>> for BenchmarkRegressor {
413 type Fitted = TrainedBenchmarkRegressor;
414
415 fn fit(self, x: &Array2<f64>, y: &Array1<f64>) -> Result<Self::Fitted, SklearsError> {
416 let mean_value = y.mean().unwrap_or(0.0);
417
418 let mut sorted_y = y.to_vec();
419 sorted_y.sort_by(|a, b| a.partial_cmp(b).unwrap());
420 let median_value = if sorted_y.len() % 2 == 0 {
421 let mid = sorted_y.len() / 2;
422 (sorted_y[mid - 1] + sorted_y[mid]) / 2.0
423 } else {
424 sorted_y[sorted_y.len() / 2]
425 };
426
427 let training_data = match &self.strategy {
428 BenchmarkStrategy::NearestNeighbor => Some((x.clone(), y.clone())),
429 _ => None,
430 };
431
432 let trend_coefficients = match &self.strategy {
433 BenchmarkStrategy::LinearTrend => {
434 let n = y.len() as f64;
436 let sum_x = (0..y.len()).sum::<usize>() as f64;
437 let sum_y = y.sum();
438 let sum_xy = y
439 .iter()
440 .enumerate()
441 .map(|(i, &yi)| i as f64 * yi)
442 .sum::<f64>();
443 let sum_x2 = (0..y.len()).map(|i| (i as f64).powi(2)).sum::<f64>();
444
445 let slope = (n * sum_xy - sum_x * sum_y) / (n * sum_x2 - sum_x.powi(2));
446 let intercept = (sum_y - slope * sum_x) / n;
447 Some((slope, intercept))
448 }
449 _ => None,
450 };
451
452 let moving_avg_values = match &self.strategy {
453 BenchmarkStrategy::MovingAverage { window_size } => {
454 let mut values = Vec::new();
455 for i in 0..y.len() {
456 let start = i.saturating_sub(*window_size);
457 let window_mean = y.slice(s![start..=i]).mean().unwrap_or(0.0);
458 values.push(window_mean);
459 }
460 Some(Array1::from(values))
461 }
462 _ => None,
463 };
464
465 Ok(TrainedBenchmarkRegressor {
466 strategy: self.strategy,
467 mean_value,
468 median_value,
469 training_data,
470 trend_coefficients,
471 moving_avg_values,
472 random_state: self.random_state,
473 })
474 }
475}
476
477impl Predict<Array2<f64>, Array1<f64>> for TrainedBenchmarkRegressor {
478 fn predict(&self, x: &Array2<f64>) -> Result<Array1<f64>, SklearsError> {
479 let n_samples = x.nrows();
480 let mut predictions = Array1::zeros(n_samples);
481
482 match &self.strategy {
483 BenchmarkStrategy::ZeroRule => {
484 predictions.fill(self.mean_value);
485 }
486
487 BenchmarkStrategy::LinearTrend => {
488 if let Some((slope, intercept)) = self.trend_coefficients {
489 for i in 0..n_samples {
490 predictions[i] = slope * i as f64 + intercept;
491 }
492 } else {
493 predictions.fill(self.mean_value);
494 }
495 }
496
497 BenchmarkStrategy::MovingAverage { .. } => {
498 if let Some(ref values) = self.moving_avg_values {
499 let last_value = values.last().copied().unwrap_or(self.mean_value);
500 predictions.fill(last_value);
501 } else {
502 predictions.fill(self.mean_value);
503 }
504 }
505
506 BenchmarkStrategy::NearestNeighbor => {
507 if let Some((train_x, train_y)) = &self.training_data {
508 for i in 0..n_samples {
509 let test_point = x.row(i);
510 let mut min_distance = f64::INFINITY;
511 let mut nearest_value = self.mean_value;
512
513 for j in 0..train_x.nrows() {
514 let train_point = train_x.row(j);
515 let distance: f64 = test_point
516 .iter()
517 .zip(train_point.iter())
518 .map(|(a, b)| (a - b).powi(2))
519 .sum::<f64>()
520 .sqrt();
521
522 if distance < min_distance {
523 min_distance = distance;
524 nearest_value = train_y[j];
525 }
526 }
527 predictions[i] = nearest_value;
528 }
529 } else {
530 predictions.fill(self.mean_value);
531 }
532 }
533
534 _ => {
535 predictions.fill(self.mean_value);
536 }
537 }
538
539 Ok(predictions)
540 }
541}
542
/// Benchmark classifier configured with a domain-specific strategy.
///
/// NOTE(review): no `Fit`/`Predict` implementation for this type is visible
/// in this file; both fields are currently unread here.
#[derive(Debug, Clone)]
pub struct DomainBenchmarkClassifier {
    /// Domain-specific baseline strategy.
    strategy: DomainStrategy,
    /// Optional RNG seed.
    random_state: Option<u64>,
}
549
/// Factory for ready-made competition-floor baselines.
pub struct CompetitionBaseline;

impl CompetitionBaseline {
    /// Majority-class baseline classifier.
    pub fn classifier() -> BenchmarkClassifier {
        BenchmarkClassifier::new(BenchmarkStrategy::CompetitionBaseline)
    }

    /// Mean-prediction baseline regressor.
    pub fn regressor() -> BenchmarkRegressor {
        BenchmarkRegressor::new(BenchmarkStrategy::ZeroRule)
    }

    /// The set of strategies used for an ensemble of baselines.
    pub fn ensemble_strategies() -> Vec<BenchmarkStrategy> {
        vec![
            BenchmarkStrategy::ZeroRule,
            BenchmarkStrategy::OneRule,
            BenchmarkStrategy::WeightedRandom,
            BenchmarkStrategy::NearestNeighbor,
        ]
    }
}
574
575pub struct TheoreticalBounds;
577
578impl TheoreticalBounds {
579 pub fn classification_bound(y: &Array1<i32>) -> f64 {
581 let mut class_counts = HashMap::new();
582 for &class in y.iter() {
583 *class_counts.entry(class).or_insert(0) += 1;
584 }
585
586 let total = y.len() as f64;
587 let max_count = class_counts.values().max().copied().unwrap_or(0) as f64;
588 max_count / total
589 }
590
591 pub fn random_chance_classification(n_classes: usize) -> f64 {
593 1.0 / n_classes as f64
594 }
595
596 pub fn regression_bound(y: &Array1<f64>) -> f64 {
598 let mean = y.mean().unwrap_or(0.0);
599 let variance = y.iter().map(|&yi| (yi - mean).powi(2)).sum::<f64>() / y.len() as f64;
600 variance.sqrt() }
602}
603
/// Smoke tests for the benchmark baselines.
#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_zero_rule_classifier() {
        let x =
            Array2::from_shape_vec((4, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]).unwrap();
        // Majority class is 0 (three of four labels).
        let y = array![0, 0, 1, 0];

        let classifier = BenchmarkClassifier::new(BenchmarkStrategy::ZeroRule);
        let fitted = classifier.fit(&x, &y).unwrap();
        let predictions = fitted.predict(&x).unwrap();

        assert_eq!(predictions, array![0, 0, 0, 0]);
    }

    #[test]
    fn test_one_rule_classifier() {
        let x =
            Array2::from_shape_vec((4, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]).unwrap();
        let y = array![0, 0, 1, 1];

        let fitted = BenchmarkClassifier::new(BenchmarkStrategy::OneRule)
            .fit(&x, &y)
            .unwrap();
        let predictions = fitted.predict(&x).unwrap();

        // One prediction per input row.
        assert_eq!(predictions.len(), 4);
    }

    #[test]
    fn test_benchmark_regressor() {
        let x =
            Array2::from_shape_vec((4, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]).unwrap();
        let y = array![1.0, 2.0, 3.0, 4.0];

        let fitted = BenchmarkRegressor::new(BenchmarkStrategy::ZeroRule)
            .fit(&x, &y)
            .unwrap();
        let predictions = fitted.predict(&x).unwrap();

        // ZeroRule regression predicts the training mean everywhere.
        let expected_mean = y.mean().unwrap();
        for pred in predictions.iter() {
            assert!((pred - expected_mean).abs() < 1e-10);
        }
    }

    #[test]
    fn test_linear_trend_regressor() {
        let x = Array2::from_shape_vec((4, 1), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
        // Perfectly linear target: trend predictions must be non-decreasing.
        let y = array![1.0, 2.0, 3.0, 4.0];

        let fitted = BenchmarkRegressor::new(BenchmarkStrategy::LinearTrend)
            .fit(&x, &y)
            .unwrap();
        let predictions = fitted.predict(&x).unwrap();

        assert_eq!(predictions.len(), 4);
        for i in 0..predictions.len() - 1 {
            assert!(predictions[i + 1] >= predictions[i]);
        }
    }

    #[test]
    fn test_theoretical_bounds() {
        // Majority class covers 3/4 of the labels.
        let y_class = array![0, 0, 1, 0];
        let bound = TheoreticalBounds::classification_bound(&y_class);
        assert!((bound - 0.75).abs() < 1e-10);

        let random_chance = TheoreticalBounds::random_chance_classification(2);
        assert!((random_chance - 0.5).abs() < 1e-10);

        let y_reg = array![1.0, 2.0, 3.0, 4.0];
        let reg_bound = TheoreticalBounds::regression_bound(&y_reg);
        assert!(reg_bound > 0.0);
    }

    #[test]
    fn test_competition_baseline() {
        let classifier = CompetitionBaseline::classifier();
        let regressor = CompetitionBaseline::regressor();
        let strategies = CompetitionBaseline::ensemble_strategies();

        assert!(matches!(
            classifier.strategy,
            BenchmarkStrategy::CompetitionBaseline
        ));
        assert!(matches!(regressor.strategy, BenchmarkStrategy::ZeroRule));
        assert_eq!(strategies.len(), 4);
    }

    #[test]
    fn test_nearest_neighbor_baseline() {
        let x = Array2::from_shape_vec((3, 2), vec![1.0, 1.0, 2.0, 2.0, 3.0, 3.0]).unwrap();
        let y = array![0, 1, 0];

        let fitted = BenchmarkClassifier::new(BenchmarkStrategy::NearestNeighbor)
            .fit(&x, &y)
            .unwrap();

        // (1.1, 1.1) is closest to the first training row, which has label 0.
        let test_x = Array2::from_shape_vec((1, 2), vec![1.1, 1.1]).unwrap();
        let predictions = fitted.predict(&test_x).unwrap();

        assert_eq!(predictions[0], 0);
    }
}