sklears_tree/extra_trees_enhanced.rs

//! Enhanced Extra Trees with Advanced Randomization Strategies
//!
//! This module provides advanced randomization techniques for Extra Trees:
//! - Rotation Forests (PCA-based feature transformation)
//! - Oblique splits (hyperplane-based splits)
//! - Feature binning for categorical/discrete features
//! - Sparse feature optimization

use scirs2_core::ndarray::{Array1, Array2, ArrayView1, Axis};
use scirs2_core::random::{
    essentials::{Normal, Uniform},
    CoreRandom,
};
use sklears_core::error::{Result, SklearsError};
use sklears_core::types::Float;
use std::collections::HashMap;

/// Advanced randomization strategy for Extra Trees
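///
/// A minimal selection sketch (illustrative only; the variant parameters below
/// are arbitrary examples, not recommended defaults):
///
/// ```ignore
/// // Rotate 3 random subsets, each covering ~75% of the features.
/// let strategy = RandomizationStrategy::RotationForest {
///     n_subsets: 3,
///     subset_fraction: 0.75,
/// };
/// ```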
#[derive(Debug, Clone)]
pub enum RandomizationStrategy {
    /// Standard Extra Trees with random thresholds
    Standard,
    /// Rotation Forest: apply PCA rotation to feature subsets
    RotationForest {
        /// Number of feature subsets to create
        n_subsets: usize,
        /// Fraction of features per subset
        subset_fraction: Float,
    },
    /// Oblique splits using random hyperplanes
    ObliqueSplits {
        /// Number of features to combine in each split
        n_features_per_split: usize,
    },
    /// Random projections for dimensionality reduction
    RandomProjection {
        /// Target dimensionality
        target_dim: usize,
    },
}

impl Default for RandomizationStrategy {
    fn default() -> Self {
        Self::Standard
    }
}

/// Feature binning configuration for categorical/discrete features
#[derive(Debug, Clone)]
pub struct FeatureBinning {
    /// Number of bins per feature
    pub n_bins: usize,
    /// Binning strategy
    pub strategy: BinningStrategy,
    /// Features to bin (None = all features)
    pub features_to_bin: Option<Vec<usize>>,
}

/// Binning strategy for discretizing continuous features
#[derive(Debug, Clone, Copy)]
pub enum BinningStrategy {
    /// Equal-width binning
    Uniform,
    /// Quantile-based binning (equal frequency)
    Quantile,
    /// K-means clustering for binning
    KMeans,
}

impl Default for FeatureBinning {
    fn default() -> Self {
        Self {
            n_bins: 10,
            strategy: BinningStrategy::Quantile,
            features_to_bin: None,
        }
    }
}

/// Sparse feature configuration
#[derive(Debug, Clone)]
pub struct SparseConfig {
    /// Sparsity threshold (fraction of zeros above which a feature counts as sparse)
    pub sparsity_threshold: Float,
    /// Whether to use specialized sparse algorithms
    pub use_sparse_optimization: bool,
    /// Compression strategy for sparse features
    pub compression: SparseCompression,
}

/// Sparse feature compression strategy
#[derive(Debug, Clone, Copy)]
pub enum SparseCompression {
    /// No compression
    None,
    /// Coordinate list (COO) format
    COO,
    /// Compressed sparse row (CSR) format
    CSR,
    /// Compressed sparse column (CSC) format
    CSC,
}

impl Default for SparseConfig {
    fn default() -> Self {
        Self {
            sparsity_threshold: 0.5,
            use_sparse_optimization: true,
            compression: SparseCompression::CSR,
        }
    }
}

/// Apply rotation forest transformation to features
///
/// Rotation Forest applies PCA to random feature subsets and keeps the rotated
/// features, which can improve ensemble diversity and performance. This
/// implementation approximates the PCA step with a random, column-normalized
/// linear map applied in place to each subset.
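///
/// A minimal usage sketch (illustrative; mirrors the unit test below):
///
/// ```ignore
/// use scirs2_core::ndarray::array;
/// use scirs2_core::random::thread_rng;
///
/// let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]];
/// let mut rng = thread_rng();
/// // Rotate two random subsets, each covering ~60% of the features.
/// let rotated = apply_rotation_forest(&x, 2, 0.6, &mut rng).unwrap();
/// assert_eq!(rotated.shape(), x.shape());
/// ```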
pub fn apply_rotation_forest(
    x: &Array2<Float>,
    n_subsets: usize,
    subset_fraction: Float,
    rng: &mut CoreRandom,
) -> Result<Array2<Float>> {
    let n_features = x.ncols();
    // Clamp so that a subset_fraction > 1.0 cannot request more features than exist
    let subset_size = ((n_features as Float * subset_fraction).ceil() as usize)
        .max(1)
        .min(n_features);

    let mut rotated_x = x.clone();

    for _subset_idx in 0..n_subsets {
        // Randomly select a feature subset
        let feature_indices = sample_features(n_features, subset_size, rng);

        // Extract the subset columns
        let subset = select_features(x, &feature_indices);

        // Apply a random rotation (simplified stand-in for a full PCA)
        let rotation = generate_rotation_matrix(subset_size, rng)?;
        let rotated_subset = apply_rotation(&subset, &rotation)?;

        // Write the rotated values back into the selected columns
        for (i, &feat_idx) in feature_indices.iter().enumerate() {
            if feat_idx < rotated_x.ncols() {
                for row_idx in 0..rotated_x.nrows() {
                    rotated_x[[row_idx, feat_idx]] = rotated_subset[[row_idx, i]];
                }
            }
        }
    }

    Ok(rotated_x)
}

/// Sample random feature indices without replacement
fn sample_features(n_features: usize, n_to_sample: usize, rng: &mut CoreRandom) -> Vec<usize> {
    let mut indices: Vec<usize> = (0..n_features).collect();

    // Partial Fisher-Yates shuffle: only the first n_to_sample positions are needed
    for i in 0..n_to_sample.min(n_features) {
        // i < n_features here, so the sampling range is non-empty and new() cannot fail
        let uniform = Uniform::new(i, n_features).unwrap();
        let j = rng.sample(uniform);
        indices.swap(i, j);
    }

    indices.truncate(n_to_sample);
    indices
}

/// Select specific feature columns from an array
fn select_features(x: &Array2<Float>, indices: &[usize]) -> Array2<Float> {
    let n_samples = x.nrows();
    let n_features = indices.len();
    let mut result = Array2::zeros((n_samples, n_features));

    for (new_idx, &orig_idx) in indices.iter().enumerate() {
        if orig_idx < x.ncols() {
            for row_idx in 0..n_samples {
                result[[row_idx, new_idx]] = x[[row_idx, orig_idx]];
            }
        }
    }

    result
}

/// Generate a random matrix with unit-norm columns (an approximate rotation)
fn generate_rotation_matrix(dim: usize, rng: &mut CoreRandom) -> Result<Array2<Float>> {
    let mut matrix = Array2::zeros((dim, dim));
    let normal = Normal::new(0.0, 1.0).map_err(|_| {
        SklearsError::InvalidInput("Failed to create normal distribution".to_string())
    })?;

    // Fill with i.i.d. standard-normal entries
    for i in 0..dim {
        for j in 0..dim {
            matrix[[i, j]] = rng.sample(normal);
        }
    }

    // Normalize each column to unit length (not true Gram-Schmidt, so columns
    // are not orthogonal, but sufficient for randomization)
    for j in 0..dim {
        let col = matrix.column(j);
        let dot_product: Float = col.dot(&col);
        let norm = dot_product.sqrt();
        if norm > 1e-10 {
            for i in 0..dim {
                matrix[[i, j]] /= norm;
            }
        }
    }

    Ok(matrix)
}

/// Apply rotation matrix to data
fn apply_rotation(x: &Array2<Float>, rotation: &Array2<Float>) -> Result<Array2<Float>> {
    if x.ncols() != rotation.nrows() {
        return Err(SklearsError::ShapeMismatch {
            expected: format!("x.ncols() == rotation.nrows() ({})", rotation.nrows()),
            actual: format!("x.ncols() = {}", x.ncols()),
        });
    }

    // Matrix multiplication: x * rotation
    let n_samples = x.nrows();
    let n_features = rotation.ncols();
    let mut result = Array2::zeros((n_samples, n_features));

    for i in 0..n_samples {
        for j in 0..n_features {
            let mut sum = 0.0;
            for k in 0..x.ncols() {
                sum += x[[i, k]] * rotation[[k, j]];
            }
            result[[i, j]] = sum;
        }
    }

    Ok(result)
}

/// Generate an oblique split from a random hyperplane
///
/// Instead of axis-aligned splits, oblique splits use random linear combinations of features.
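///
/// A minimal usage sketch (illustrative; mirrors the unit test below):
///
/// ```ignore
/// use scirs2_core::ndarray::array;
/// use scirs2_core::random::thread_rng;
///
/// let x = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];
/// let mut rng = thread_rng();
/// let (normal, threshold) = generate_oblique_split(&x, 2, &mut rng).unwrap();
/// // The pair defines the hyperplane: dot(sample, normal) compared to threshold.
/// assert_eq!(normal.len(), 2);
/// ```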
pub fn generate_oblique_split(
    x: &Array2<Float>,
    n_features_per_split: usize,
    rng: &mut CoreRandom,
) -> Result<(Array1<Float>, Float)> {
    let n_features = x.ncols();
    let n_to_use = n_features_per_split.min(n_features);

    // Generate a random normal vector for the hyperplane
    let mut normal = Array1::zeros(n_features);
    let dist = Normal::new(0.0, 1.0).map_err(|_| {
        SklearsError::InvalidInput("Failed to create normal distribution".to_string())
    })?;

    // Select random features and assign random weights
    let feature_indices = sample_features(n_features, n_to_use, rng);
    for &idx in &feature_indices {
        normal[idx] = rng.sample(dist);
    }

    // Normalize to unit length
    let dot_product: Float = normal.dot(&normal);
    let norm = dot_product.sqrt();
    if norm > 1e-10 {
        normal /= norm;
    }

    // Compute projections and draw a random threshold between their extremes
    let projections = compute_projections(x, &normal);
    let min_proj = projections
        .iter()
        .cloned()
        .fold(Float::INFINITY, Float::min);
    let max_proj = projections
        .iter()
        .cloned()
        .fold(Float::NEG_INFINITY, Float::max);

    // Degenerate case: every sample projects to the same value, so return that
    // value directly rather than failing to build a zero-width distribution
    if max_proj <= min_proj {
        return Ok((normal, min_proj));
    }

    let threshold_dist = Uniform::new(min_proj, max_proj).map_err(|_| {
        SklearsError::InvalidInput("Failed to create threshold distribution".to_string())
    })?;
    let threshold = rng.sample(threshold_dist);

    Ok((normal, threshold))
}

/// Compute projections of samples onto a direction vector
fn compute_projections(x: &Array2<Float>, direction: &Array1<Float>) -> Vec<Float> {
    let mut projections = Vec::with_capacity(x.nrows());

    for row in x.axis_iter(Axis(0)) {
        let projection = row.dot(direction);
        projections.push(projection);
    }

    projections
}

/// Apply feature binning to discretize continuous features
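///
/// A minimal usage sketch (illustrative; mirrors the unit tests below):
///
/// ```ignore
/// use scirs2_core::ndarray::array;
///
/// let x = array![[1.0, 5.0], [2.0, 6.0], [3.0, 7.0], [4.0, 8.0]];
/// let binning = FeatureBinning {
///     n_bins: 2,
///     strategy: BinningStrategy::Uniform,
///     features_to_bin: Some(vec![0]), // bin only the first column
/// };
/// let binned = apply_feature_binning(&x, &binning).unwrap();
/// // Column 0 now holds bin indices (0.0 or 1.0); column 1 is unchanged.
/// ```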
pub fn apply_feature_binning(x: &Array2<Float>, binning: &FeatureBinning) -> Result<Array2<Float>> {
    let mut binned_x = x.clone();

    let features_to_process = match &binning.features_to_bin {
        Some(indices) => indices.clone(),
        None => (0..x.ncols()).collect(),
    };

    for &feat_idx in &features_to_process {
        // Silently skip out-of-range feature indices
        if feat_idx >= x.ncols() {
            continue;
        }

        let feature_col = x.column(feat_idx);
        let bin_edges = compute_bin_edges(&feature_col, binning.n_bins, binning.strategy)?;

        // Replace each value with the index of its bin
        for row_idx in 0..x.nrows() {
            let value = x[[row_idx, feat_idx]];
            let bin_idx = find_bin(value, &bin_edges);
            binned_x[[row_idx, feat_idx]] = bin_idx as Float;
        }
    }

    Ok(binned_x)
}

/// Compute bin edges for a feature
fn compute_bin_edges(
    feature: &ArrayView1<Float>,
    n_bins: usize,
    strategy: BinningStrategy,
) -> Result<Vec<Float>> {
    let mut values: Vec<Float> = feature.to_vec();

    match strategy {
        BinningStrategy::Uniform => {
            // Equal-width edges spanning [min, max]
            let min_val = values.iter().cloned().fold(Float::INFINITY, Float::min);
            let max_val = values.iter().cloned().fold(Float::NEG_INFINITY, Float::max);

            let mut edges = Vec::with_capacity(n_bins + 1);
            for i in 0..=n_bins {
                edges.push(min_val + (max_val - min_val) * (i as Float / n_bins as Float));
            }
            Ok(edges)
        }

        BinningStrategy::Quantile => {
            // Equal-frequency edges taken at evenly spaced ranks
            values.sort_by(|a, b| a.partial_cmp(b).unwrap());

            let mut edges = Vec::with_capacity(n_bins + 1);
            for i in 0..=n_bins {
                let idx = ((values.len() - 1) as Float * (i as Float / n_bins as Float)) as usize;
                edges.push(values[idx]);
            }

            // Repeated values can yield duplicate edges; drop them (edges are sorted)
            edges.dedup();
            Ok(edges)
        }

        BinningStrategy::KMeans => {
            // Simplified 1-D k-means: seed centers at evenly spaced quantiles
            values.sort_by(|a, b| a.partial_cmp(b).unwrap());

            let mut centers = Vec::with_capacity(n_bins);
            for i in 0..n_bins {
                let idx =
                    ((values.len() - 1) as Float * ((i as Float + 0.5) / n_bins as Float)) as usize;
                centers.push(values[idx]);
            }

            // A few Lloyd iterations (fixed count, no convergence check)
            for _ in 0..5 {
                let mut new_centers = vec![0.0; n_bins];
                let mut counts = vec![0; n_bins];

                // Assign each value to its nearest center
                for &val in &values {
                    let mut best_center = 0;
                    let mut best_dist = Float::INFINITY;

                    for (c_idx, &center) in centers.iter().enumerate() {
                        let dist = (val - center).abs();
                        if dist < best_dist {
                            best_dist = dist;
                            best_center = c_idx;
                        }
                    }

                    new_centers[best_center] += val;
                    counts[best_center] += 1;
                }

                // Move each center to the mean of its assigned values
                for i in 0..n_bins {
                    if counts[i] > 0 {
                        centers[i] = new_centers[i] / counts[i] as Float;
                    }
                }
            }

            centers.sort_by(|a, b| a.partial_cmp(b).unwrap());

            // Edges are midpoints between adjacent centers, padded with +/- infinity
            let mut edges = vec![Float::NEG_INFINITY];
            for i in 0..centers.len() - 1 {
                edges.push((centers[i] + centers[i + 1]) / 2.0);
            }
            edges.push(Float::INFINITY);

            Ok(edges)
        }
    }
}

/// Find which bin a value belongs to; values beyond the last edge are clamped
/// to the last bin
fn find_bin(value: Float, bin_edges: &[Float]) -> usize {
    for (i, &edge) in bin_edges.iter().enumerate().skip(1) {
        if value <= edge {
            return i - 1;
        }
    }
    // Value exceeds the last edge: clamp to the final bin
    bin_edges.len().saturating_sub(2)
}

/// Detect sparse features and compute per-feature sparsity (fraction of zeros)
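///
/// A minimal usage sketch (illustrative; mirrors the unit test below):
///
/// ```ignore
/// use scirs2_core::ndarray::array;
///
/// let x = array![[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 3.0]];
/// let sparsity = analyze_sparsity(&x);
/// // Each column has two zeros out of three entries.
/// assert!((sparsity[&0] - 2.0 / 3.0).abs() < 1e-9);
/// ```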
pub fn analyze_sparsity(x: &Array2<Float>) -> HashMap<usize, Float> {
    let mut sparsity_map = HashMap::new();

    for feat_idx in 0..x.ncols() {
        let feature_col = x.column(feat_idx);
        // Treat values within 1e-10 of zero as zeros
        let n_zeros = feature_col.iter().filter(|&&v| v.abs() < 1e-10).count();
        let sparsity = n_zeros as Float / feature_col.len() as Float;
        sparsity_map.insert(feat_idx, sparsity);
    }

    sparsity_map
}

/// Sparse feature representation: parallel index/value lists for the non-zero
/// entries of a single feature column (indices are in ascending order)
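///
/// A minimal round-trip sketch (illustrative; mirrors the unit test below):
///
/// ```ignore
/// use scirs2_core::ndarray::array;
///
/// let dense = array![1.0, 0.0, 0.0, 2.0, 0.0];
/// let sparse = SparseFeature::from_dense(&dense.view());
/// assert_eq!(sparse.indices, vec![0, 3]);
/// assert_eq!(sparse.get(1), 0.0); // absent indices read back as zero
/// ```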
#[derive(Debug, Clone)]
pub struct SparseFeature {
    /// Non-zero indices (ascending)
    pub indices: Vec<usize>,
    /// Non-zero values, parallel to `indices`
    pub values: Vec<Float>,
    /// Total length of the original dense column
    pub length: usize,
}

impl SparseFeature {
    /// Create from a dense array, keeping entries with |value| > 1e-10
    pub fn from_dense(data: &ArrayView1<Float>) -> Self {
        let mut indices = Vec::new();
        let mut values = Vec::new();

        for (i, &val) in data.iter().enumerate() {
            if val.abs() > 1e-10 {
                indices.push(i);
                values.push(val);
            }
        }

        Self {
            indices,
            values,
            length: data.len(),
        }
    }

    /// Compute the sparsity ratio (fraction of zero entries)
    pub fn sparsity(&self) -> Float {
        1.0 - (self.values.len() as Float / self.length as Float)
    }

    /// Get the value at an index; absent indices are implicit zeros
    pub fn get(&self, idx: usize) -> Float {
        self.indices
            .iter()
            .position(|&i| i == idx)
            .map(|pos| self.values[pos])
            .unwrap_or(0.0)
    }
}

/// Convert each feature column to a sparse representation when it is sparse
/// enough to benefit; otherwise store the column densely in the same container
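///
/// A minimal usage sketch (illustrative; mirrors the unit test below):
///
/// ```ignore
/// use scirs2_core::ndarray::array;
///
/// let x = array![[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 3.0]];
/// let config = SparseConfig::default(); // threshold 0.5, optimization on
/// let features = to_sparse_if_beneficial(&x, &config).unwrap();
/// assert_eq!(features.len(), 3); // one SparseFeature per column
/// ```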
pub fn to_sparse_if_beneficial(
    x: &Array2<Float>,
    config: &SparseConfig,
) -> Result<Vec<SparseFeature>> {
    let sparsity_map = analyze_sparsity(x);
    let mut sparse_features = Vec::with_capacity(x.ncols());

    for feat_idx in 0..x.ncols() {
        let sparsity = sparsity_map.get(&feat_idx).copied().unwrap_or(0.0);

        if sparsity >= config.sparsity_threshold && config.use_sparse_optimization {
            let feature_col = x.column(feat_idx);
            sparse_features.push(SparseFeature::from_dense(&feature_col));
        } else {
            // Not sparse enough: keep the column dense, wrapped in the same
            // SparseFeature container with every index present
            let feature_col = x.column(feat_idx);
            let indices: Vec<usize> = (0..feature_col.len()).collect();
            let values: Vec<Float> = feature_col.to_vec();
            sparse_features.push(SparseFeature {
                indices,
                values,
                length: feature_col.len(),
            });
        }
    }

    Ok(sparse_features)
}

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;
    use scirs2_core::random::thread_rng;

    #[test]
    fn test_rotation_forest() {
        let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]];

        let mut rng = thread_rng();
        let result = apply_rotation_forest(&x, 2, 0.6, &mut rng).unwrap();

        assert_eq!(result.shape(), x.shape());
    }

    #[test]
    fn test_oblique_split() {
        let x = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]];

        let mut rng = thread_rng();
        let (normal, threshold) = generate_oblique_split(&x, 2, &mut rng).unwrap();

        assert_eq!(normal.len(), 2);
        assert!(threshold.is_finite());
    }

    #[test]
    fn test_feature_binning_uniform() {
        let x = array![[1.0, 5.0], [2.0, 6.0], [3.0, 7.0], [4.0, 8.0]];

        let binning = FeatureBinning {
            n_bins: 2,
            strategy: BinningStrategy::Uniform,
            features_to_bin: Some(vec![0]),
        };

        let result = apply_feature_binning(&x, &binning).unwrap();
        assert_eq!(result.shape(), x.shape());

        // Values should be binned
        let binned_col = result.column(0);
        for &val in binned_col.iter() {
            assert!(val >= 0.0 && val < 2.0);
        }
    }
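
    #[test]
    fn test_feature_binning_quantile() {
        // A small added check (sketch): with 2 quantile bins over one sorted
        // column, the split edge falls at the median, giving bins [0, 0, 1, 1].
        let x = array![[1.0], [2.0], [3.0], [4.0]];

        let binning = FeatureBinning {
            n_bins: 2,
            strategy: BinningStrategy::Quantile,
            features_to_bin: None,
        };

        let result = apply_feature_binning(&x, &binning).unwrap();
        assert_eq!(result.column(0).to_vec(), vec![0.0, 0.0, 1.0, 1.0]);
    }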

    #[test]
    fn test_sparsity_analysis() {
        let x = array![[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 3.0]];

        let sparsity_map = analyze_sparsity(&x);

        // Each feature should be 2/3 sparse
        for feat_idx in 0..3 {
            let sparsity = sparsity_map.get(&feat_idx).unwrap();
            assert!((*sparsity - 2.0 / 3.0).abs() < 0.01);
        }
    }

    #[test]
    fn test_sparse_feature() {
        let dense = array![1.0, 0.0, 0.0, 2.0, 0.0];
        let sparse = SparseFeature::from_dense(&dense.view());

        assert_eq!(sparse.indices, vec![0, 3]);
        assert_eq!(sparse.values, vec![1.0, 2.0]);
        assert_eq!(sparse.length, 5);
        assert_eq!(sparse.sparsity(), 0.6);

        assert_eq!(sparse.get(0), 1.0);
        assert_eq!(sparse.get(1), 0.0);
        assert_eq!(sparse.get(3), 2.0);
    }

    #[test]
    fn test_to_sparse_if_beneficial() {
        let x = array![[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 3.0]];

        let config = SparseConfig {
            sparsity_threshold: 0.5,
            use_sparse_optimization: true,
            compression: SparseCompression::CSR,
        };

        let sparse_features = to_sparse_if_beneficial(&x, &config).unwrap();
        assert_eq!(sparse_features.len(), 3);

        // All features should be sparse
        for sf in &sparse_features {
            assert!(sf.sparsity() >= 0.5);
        }
    }
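
    #[test]
    fn test_find_bin_clamps_to_last_bin() {
        // A small added check (sketch) of find_bin's contract: values above the
        // last edge are clamped into the final bin rather than indexing past it.
        let edges = vec![0.0, 1.0, 2.0];
        assert_eq!(find_bin(0.5, &edges), 0);
        assert_eq!(find_bin(1.5, &edges), 1);
        assert_eq!(find_bin(5.0, &edges), 1);
    }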
}