1use scirs2_core::ndarray::{Array1, Array2, ArrayView1, Axis};
10use scirs2_core::random::{
11 essentials::{Normal, Uniform},
12 CoreRandom,
13};
14use sklears_core::error::{Result, SklearsError};
15use sklears_core::types::Float;
16use std::collections::HashMap;
17
/// Strategy that controls how additional randomness is injected when
/// building ensemble members.
#[derive(Debug, Clone)]
pub enum RandomizationStrategy {
    /// Plain axis-aligned splitting with no extra feature transformation.
    Standard,
    /// Rotation-forest style preprocessing: random feature subsets are
    /// rotated by random linear transforms before splitting
    /// (see `apply_rotation_forest`).
    RotationForest {
        /// Number of feature subsets to draw and rotate.
        n_subsets: usize,
        /// Fraction of all features included in each subset.
        subset_fraction: Float,
    },
    /// Split on random linear combinations of several features
    /// (see `generate_oblique_split`).
    ObliqueSplits {
        /// Number of features participating in each oblique split.
        n_features_per_split: usize,
    },
    /// Random linear projection of the feature space.
    RandomProjection {
        /// Target dimensionality of the projection — consumer of this
        /// variant is not visible in this file; TODO confirm usage.
        target_dim: usize,
    },
}
41
42impl Default for RandomizationStrategy {
43 fn default() -> Self {
44 Self::Standard
45 }
46}
47
/// Configuration for discretizing continuous features into bins.
#[derive(Debug, Clone)]
pub struct FeatureBinning {
    /// Number of bins per feature.
    pub n_bins: usize,
    /// How bin edges are computed (uniform width, quantiles, or 1-D k-means).
    pub strategy: BinningStrategy,
    /// Restrict binning to these feature indices; `None` bins every column.
    pub features_to_bin: Option<Vec<usize>>,
}
58
/// Method used to place bin edges when discretizing a feature.
#[derive(Debug, Clone, Copy)]
pub enum BinningStrategy {
    /// Equal-width bins between the feature's minimum and maximum.
    Uniform,
    /// Edges at evenly spaced order statistics, so bins hold roughly
    /// equal numbers of samples.
    Quantile,
    /// Edges at midpoints between 1-D k-means cluster centers.
    KMeans,
}
69
70impl Default for FeatureBinning {
71 fn default() -> Self {
72 Self {
73 n_bins: 10,
74 strategy: BinningStrategy::Quantile,
75 features_to_bin: None,
76 }
77 }
78}
79
/// Settings for the optional sparse representation of feature columns.
#[derive(Debug, Clone)]
pub struct SparseConfig {
    /// Minimum fraction of (near-)zero entries a column needs before the
    /// compact sparse representation is used.
    pub sparsity_threshold: Float,
    /// Master switch; when `false`, columns always stay dense.
    pub use_sparse_optimization: bool,
    /// Requested compression layout — informational in this file: the
    /// conversion code here always stores index/value pairs regardless.
    /// TODO confirm how other modules consume this field.
    pub compression: SparseCompression,
}
90
/// Sparse-matrix storage layouts.
#[derive(Debug, Clone, Copy)]
pub enum SparseCompression {
    /// No compression; keep dense storage.
    None,
    /// Coordinate (triplet) format.
    COO,
    /// Compressed sparse row format.
    CSR,
    /// Compressed sparse column format.
    CSC,
}
103
104impl Default for SparseConfig {
105 fn default() -> Self {
106 Self {
107 sparsity_threshold: 0.5,
108 use_sparse_optimization: true,
109 compression: SparseCompression::CSR,
110 }
111 }
112}
113
114pub fn apply_rotation_forest(
119 x: &Array2<Float>,
120 n_subsets: usize,
121 subset_fraction: Float,
122 rng: &mut CoreRandom,
123) -> Result<Array2<Float>> {
124 let n_features = x.ncols();
125 let subset_size = ((n_features as Float * subset_fraction).ceil() as usize).max(1);
126
127 let mut rotated_x = x.clone();
128
129 for _subset_idx in 0..n_subsets {
130 let feature_indices = sample_features(n_features, subset_size, rng);
132
133 let subset = select_features(x, &feature_indices);
135
136 let rotation = generate_rotation_matrix(subset_size, rng)?;
138 let rotated_subset = apply_rotation(&subset, &rotation)?;
139
140 for (i, &feat_idx) in feature_indices.iter().enumerate() {
142 if feat_idx < rotated_x.ncols() {
143 for row_idx in 0..rotated_x.nrows() {
144 rotated_x[[row_idx, feat_idx]] = rotated_subset[[row_idx, i]];
145 }
146 }
147 }
148 }
149
150 Ok(rotated_x)
151}
152
153fn sample_features(n_features: usize, n_to_sample: usize, rng: &mut CoreRandom) -> Vec<usize> {
155 let mut indices: Vec<usize> = (0..n_features).collect();
156
157 for i in 0..n_to_sample.min(n_features) {
159 let uniform = Uniform::new(i, n_features).unwrap();
160 let j = rng.sample(uniform);
161 indices.swap(i, j);
162 }
163
164 indices.truncate(n_to_sample);
165 indices
166}
167
168fn select_features(x: &Array2<Float>, indices: &[usize]) -> Array2<Float> {
170 let n_samples = x.nrows();
171 let n_features = indices.len();
172 let mut result = Array2::zeros((n_samples, n_features));
173
174 for (new_idx, &orig_idx) in indices.iter().enumerate() {
175 if orig_idx < x.ncols() {
176 for row_idx in 0..n_samples {
177 result[[row_idx, new_idx]] = x[[row_idx, orig_idx]];
178 }
179 }
180 }
181
182 result
183}
184
185fn generate_rotation_matrix(dim: usize, rng: &mut CoreRandom) -> Result<Array2<Float>> {
187 let mut matrix = Array2::zeros((dim, dim));
188 let normal = Normal::new(0.0, 1.0).map_err(|_| {
189 SklearsError::InvalidInput("Failed to create normal distribution".to_string())
190 })?;
191
192 for i in 0..dim {
194 for j in 0..dim {
195 matrix[[i, j]] = rng.sample(normal);
196 }
197 }
198
199 for j in 0..dim {
201 let col = matrix.column(j);
202 let dot_product: Float = col.dot(&col);
203 let norm = dot_product.sqrt();
204 if norm > 1e-10 {
205 for i in 0..dim {
206 matrix[[i, j]] /= norm;
207 }
208 }
209 }
210
211 Ok(matrix)
212}
213
214fn apply_rotation(x: &Array2<Float>, rotation: &Array2<Float>) -> Result<Array2<Float>> {
216 if x.ncols() != rotation.nrows() {
217 return Err(SklearsError::ShapeMismatch {
218 expected: format!("x.ncols() == rotation.nrows() ({})", rotation.nrows()),
219 actual: format!("x.ncols() = {}", x.ncols()),
220 });
221 }
222
223 let n_samples = x.nrows();
225 let n_features = rotation.ncols();
226 let mut result = Array2::zeros((n_samples, n_features));
227
228 for i in 0..n_samples {
229 for j in 0..n_features {
230 let mut sum = 0.0;
231 for k in 0..x.ncols() {
232 sum += x[[i, k]] * rotation[[k, j]];
233 }
234 result[[i, j]] = sum;
235 }
236 }
237
238 Ok(result)
239}
240
241pub fn generate_oblique_split(
245 x: &Array2<Float>,
246 n_features_per_split: usize,
247 rng: &mut CoreRandom,
248) -> Result<(Array1<Float>, Float)> {
249 let n_features = x.ncols();
250 let n_to_use = n_features_per_split.min(n_features);
251
252 let mut normal = Array1::zeros(n_features);
254 let dist = Normal::new(0.0, 1.0).map_err(|_| {
255 SklearsError::InvalidInput("Failed to create normal distribution".to_string())
256 })?;
257
258 let feature_indices = sample_features(n_features, n_to_use, rng);
260 for &idx in &feature_indices {
261 normal[idx] = rng.sample(dist);
262 }
263
264 let dot_product: Float = normal.dot(&normal);
266 let norm = dot_product.sqrt();
267 if norm > 1e-10 {
268 normal /= norm;
269 }
270
271 let projections = compute_projections(x, &normal);
273 let min_proj = projections
274 .iter()
275 .cloned()
276 .fold(Float::INFINITY, Float::min);
277 let max_proj = projections
278 .iter()
279 .cloned()
280 .fold(Float::NEG_INFINITY, Float::max);
281
282 let threshold_dist = Uniform::new(min_proj, max_proj).map_err(|_| {
283 SklearsError::InvalidInput("Failed to create threshold distribution".to_string())
284 })?;
285 let threshold = rng.sample(threshold_dist);
286
287 Ok((normal, threshold))
288}
289
290fn compute_projections(x: &Array2<Float>, direction: &Array1<Float>) -> Vec<Float> {
292 let mut projections = Vec::with_capacity(x.nrows());
293
294 for row in x.axis_iter(Axis(0)) {
295 let projection = row.dot(direction);
296 projections.push(projection);
297 }
298
299 projections
300}
301
302pub fn apply_feature_binning(x: &Array2<Float>, binning: &FeatureBinning) -> Result<Array2<Float>> {
304 let mut binned_x = x.clone();
305
306 let features_to_process = match &binning.features_to_bin {
307 Some(indices) => indices.clone(),
308 None => (0..x.ncols()).collect(),
309 };
310
311 for &feat_idx in &features_to_process {
312 if feat_idx >= x.ncols() {
313 continue;
314 }
315
316 let feature_col = x.column(feat_idx);
317 let bin_edges = compute_bin_edges(&feature_col, binning.n_bins, binning.strategy)?;
318
319 for row_idx in 0..x.nrows() {
321 let value = x[[row_idx, feat_idx]];
322 let bin_idx = find_bin(value, &bin_edges);
323 binned_x[[row_idx, feat_idx]] = bin_idx as Float;
324 }
325 }
326
327 Ok(binned_x)
328}
329
330fn compute_bin_edges(
332 feature: &ArrayView1<Float>,
333 n_bins: usize,
334 strategy: BinningStrategy,
335) -> Result<Vec<Float>> {
336 let mut values: Vec<Float> = feature.to_vec();
337
338 match strategy {
339 BinningStrategy::Uniform => {
340 let min_val = values.iter().cloned().fold(Float::INFINITY, Float::min);
341 let max_val = values.iter().cloned().fold(Float::NEG_INFINITY, Float::max);
342
343 let mut edges = Vec::with_capacity(n_bins + 1);
344 for i in 0..=n_bins {
345 edges.push(min_val + (max_val - min_val) * (i as Float / n_bins as Float));
346 }
347 Ok(edges)
348 }
349
350 BinningStrategy::Quantile => {
351 values.sort_by(|a, b| a.partial_cmp(b).unwrap());
352
353 let mut edges = Vec::with_capacity(n_bins + 1);
354 for i in 0..=n_bins {
355 let idx = ((values.len() - 1) as Float * (i as Float / n_bins as Float)) as usize;
356 edges.push(values[idx]);
357 }
358
359 edges.dedup();
361 Ok(edges)
362 }
363
364 BinningStrategy::KMeans => {
365 values.sort_by(|a, b| a.partial_cmp(b).unwrap());
367
368 let mut centers = Vec::with_capacity(n_bins);
369 for i in 0..n_bins {
370 let idx =
371 ((values.len() - 1) as Float * ((i as Float + 0.5) / n_bins as Float)) as usize;
372 centers.push(values[idx]);
373 }
374
375 for _ in 0..5 {
377 let mut new_centers = vec![0.0; n_bins];
378 let mut counts = vec![0; n_bins];
379
380 for &val in &values {
381 let mut best_center = 0;
382 let mut best_dist = Float::INFINITY;
383
384 for (c_idx, ¢er) in centers.iter().enumerate() {
385 let dist = (val - center).abs();
386 if dist < best_dist {
387 best_dist = dist;
388 best_center = c_idx;
389 }
390 }
391
392 new_centers[best_center] += val;
393 counts[best_center] += 1;
394 }
395
396 for i in 0..n_bins {
397 if counts[i] > 0 {
398 centers[i] = new_centers[i] / counts[i] as Float;
399 }
400 }
401 }
402
403 centers.sort_by(|a, b| a.partial_cmp(b).unwrap());
404
405 let mut edges = vec![Float::NEG_INFINITY];
407 for i in 0..centers.len() - 1 {
408 edges.push((centers[i] + centers[i + 1]) / 2.0);
409 }
410 edges.push(Float::INFINITY);
411
412 Ok(edges)
413 }
414 }
415}
416
417fn find_bin(value: Float, bin_edges: &[Float]) -> usize {
419 for (i, &edge) in bin_edges.iter().enumerate().skip(1) {
420 if value <= edge {
421 return i - 1;
422 }
423 }
424 bin_edges.len().saturating_sub(2)
425}
426
427pub fn analyze_sparsity(x: &Array2<Float>) -> HashMap<usize, Float> {
429 let mut sparsity_map = HashMap::new();
430
431 for feat_idx in 0..x.ncols() {
432 let feature_col = x.column(feat_idx);
433 let n_zeros = feature_col.iter().filter(|&&v| v.abs() < 1e-10).count();
434 let sparsity = n_zeros as Float / feature_col.len() as Float;
435 sparsity_map.insert(feat_idx, sparsity);
436 }
437
438 sparsity_map
439}
440
/// Sparse (index/value pair) representation of a single feature column.
#[derive(Debug, Clone)]
pub struct SparseFeature {
    /// Positions of the stored entries; ascending when built via
    /// `from_dense`.
    pub indices: Vec<usize>,
    /// Values at the corresponding `indices` positions.
    pub values: Vec<Float>,
    /// Length of the original dense column.
    pub length: usize,
}
451
452impl SparseFeature {
453 pub fn from_dense(data: &ArrayView1<Float>) -> Self {
455 let mut indices = Vec::new();
456 let mut values = Vec::new();
457
458 for (i, &val) in data.iter().enumerate() {
459 if val.abs() > 1e-10 {
460 indices.push(i);
461 values.push(val);
462 }
463 }
464
465 Self {
466 indices,
467 values,
468 length: data.len(),
469 }
470 }
471
472 pub fn sparsity(&self) -> Float {
474 1.0 - (self.values.len() as Float / self.length as Float)
475 }
476
477 pub fn get(&self, idx: usize) -> Float {
479 self.indices
480 .iter()
481 .position(|&i| i == idx)
482 .map(|pos| self.values[pos])
483 .unwrap_or(0.0)
484 }
485}
486
487pub fn to_sparse_if_beneficial(
489 x: &Array2<Float>,
490 config: &SparseConfig,
491) -> Result<Vec<SparseFeature>> {
492 let sparsity_map = analyze_sparsity(x);
493 let mut sparse_features = Vec::with_capacity(x.ncols());
494
495 for feat_idx in 0..x.ncols() {
496 let sparsity = sparsity_map.get(&feat_idx).copied().unwrap_or(0.0);
497
498 if sparsity >= config.sparsity_threshold && config.use_sparse_optimization {
499 let feature_col = x.column(feat_idx);
500 sparse_features.push(SparseFeature::from_dense(&feature_col));
501 } else {
502 let feature_col = x.column(feat_idx);
504 let indices: Vec<usize> = (0..feature_col.len()).collect();
505 let values: Vec<Float> = feature_col.to_vec();
506 sparse_features.push(SparseFeature {
507 indices,
508 values,
509 length: feature_col.len(),
510 });
511 }
512 }
513
514 Ok(sparse_features)
515}
516
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;
    use scirs2_core::random::thread_rng;

    // Rotation-forest preprocessing must preserve the matrix shape
    // regardless of RNG state.
    #[test]
    fn test_rotation_forest() {
        let data = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]];
        let mut rng = thread_rng();

        let rotated = apply_rotation_forest(&data, 2, 0.6, &mut rng).unwrap();

        assert_eq!(rotated.shape(), data.shape());
    }

    // An oblique split yields a direction of the right length and a
    // finite threshold.
    #[test]
    fn test_oblique_split() {
        let data = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
        let mut rng = thread_rng();

        let (direction, threshold) = generate_oblique_split(&data, 2, &mut rng).unwrap();

        assert_eq!(direction.len(), 2);
        assert!(threshold.is_finite());
    }

    // Uniform binning with 2 bins maps the selected column into {0, 1}.
    #[test]
    fn test_feature_binning_uniform() {
        let data = array![[1.0, 5.0], [2.0, 6.0], [3.0, 7.0], [4.0, 8.0]];
        let binning = FeatureBinning {
            n_bins: 2,
            strategy: BinningStrategy::Uniform,
            features_to_bin: Some(vec![0]),
        };

        let binned = apply_feature_binning(&data, &binning).unwrap();

        assert_eq!(binned.shape(), data.shape());
        assert!(binned.column(0).iter().all(|&v| v >= 0.0 && v < 2.0));
    }

    // Each column of the fixture holds exactly one non-zero out of three
    // entries, so sparsity is 2/3 everywhere.
    #[test]
    fn test_sparsity_analysis() {
        let data = array![[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 3.0]];

        let sparsity_map = analyze_sparsity(&data);

        for feat_idx in 0..3 {
            let sparsity = *sparsity_map.get(&feat_idx).unwrap();
            assert!((sparsity - 2.0 / 3.0).abs() < 0.01);
        }
    }

    // Round-trip through the sparse representation: indices, values,
    // length, sparsity, and point lookups all match the dense original.
    #[test]
    fn test_sparse_feature() {
        let dense = array![1.0, 0.0, 0.0, 2.0, 0.0];

        let sparse = SparseFeature::from_dense(&dense.view());

        assert_eq!(sparse.indices, vec![0, 3]);
        assert_eq!(sparse.values, vec![1.0, 2.0]);
        assert_eq!(sparse.length, 5);
        assert_eq!(sparse.sparsity(), 0.6);
        assert_eq!(sparse.get(0), 1.0);
        assert_eq!(sparse.get(1), 0.0);
        assert_eq!(sparse.get(3), 2.0);
    }

    // Columns at 2/3 sparsity clear the 0.5 threshold, so every feature
    // reports sparsity >= 0.5.
    #[test]
    fn test_to_sparse_if_beneficial() {
        let data = array![[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 3.0]];
        let config = SparseConfig {
            sparsity_threshold: 0.5,
            use_sparse_optimization: true,
            compression: SparseCompression::CSR,
        };

        let features = to_sparse_if_beneficial(&data, &config).unwrap();

        assert_eq!(features.len(), 3);
        assert!(features.iter().all(|sf| sf.sparsity() >= 0.5));
    }
}