treeboost 0.1.0

High-performance Gradient Boosted Decision Tree engine for large-scale tabular data
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
//! Power transformations for normalizing distributions
//!
//! This module provides power transforms that make data more Gaussian-like:
//!
//! ## Yeo-Johnson Transform
//! - Handles **both positive and negative values** (unlike Box-Cox or log)
//! - Makes data more normally distributed (critical for linear models)
//! - For trees: minimal impact, but helpful for mixed ensembles
//!
//! ## Design Philosophy
//!
//! Power transforms are **ESSENTIAL** for linear model components in mixed ensembles.
//! Linear models (OLS, Ridge, Lasso) are sensitive to heavily skewed features: a few
//! extreme values dominate the fit and tend to produce non-Gaussian residuals.
//! Yeo-Johnson reduces that skew and brings each feature closer to a Gaussian shape.

use crate::{Result, TreeBoostError};

// =============================================================================
// Yeo-Johnson Transform
// =============================================================================

/// Yeo-Johnson power transform for normalizing distributions
///
/// Unlike Box-Cox, Yeo-Johnson handles **both positive and negative values**.
/// It applies a power transformation controlled by parameter λ (lambda). One λ is
/// kept per feature column; it is either learned by `fit` or supplied up front
/// through `with_lambdas`.
///
/// Data is passed as a flat, row-major slice (`data[row * num_features + col]`).
/// NaN entries are ignored while fitting and are left untouched by `transform`
/// and `inverse_transform`.
///
/// # Transform Definition
///
/// For input x and parameter λ:
/// - If x ≥ 0 and λ ≠ 0: y = ((x + 1)^λ - 1) / λ
/// - If x ≥ 0 and λ = 0: y = log(x + 1)
/// - If x < 0 and λ ≠ 2: y = -((-x + 1)^(2-λ) - 1) / (2 - λ)
/// - If x < 0 and λ = 2: y = -log(-x + 1)
///
/// # Example
///
/// ```rust
/// use treeboost::preprocessing::YeoJohnsonTransform;
///
/// let mut transform = YeoJohnsonTransform::new();
///
/// // Data with skewed distribution
/// let mut data = vec![0.1, 1.0, 10.0, 100.0, -5.0, -1.0]; // 2 rows × 3 features
///
/// transform.fit(&data, 3).unwrap();
/// transform.transform(&mut data, 3).unwrap();
/// // Data is now more Gaussian-like
/// ```
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct YeoJohnsonTransform {
    /// Lambda parameter per feature, indexed by column (learned during fit or
    /// passed to `with_lambdas`)
    lambdas: Vec<f32>,
    /// Whether lambdas are available; set by `fit` and `with_lambdas`, checked by
    /// `transform` and `inverse_transform`
    fitted: bool,
    /// Max iterations of the lambda search loop (default 100)
    max_iter: usize,
    /// Lambda search stops once its bracket is narrower than this (default 1e-6)
    tolerance: f32,
}

impl Default for YeoJohnsonTransform {
    fn default() -> Self {
        Self::new()
    }
}

impl YeoJohnsonTransform {
    /// Create a new Yeo-Johnson transform
    pub fn new() -> Self {
        Self {
            lambdas: Vec::new(),
            fitted: false,
            max_iter: 100,
            tolerance: 1e-6,
        }
    }

    /// Set maximum iterations for lambda optimization
    pub fn with_max_iter(mut self, max_iter: usize) -> Self {
        self.max_iter = max_iter;
        self
    }

    /// Set tolerance for lambda optimization
    pub fn with_tolerance(mut self, tolerance: f32) -> Self {
        self.tolerance = tolerance;
        self
    }

    /// Create a transform with fixed lambdas (skip fitting)
    pub fn with_lambdas(lambdas: Vec<f32>) -> Self {
        Self {
            lambdas,
            fitted: true,
            max_iter: 100,
            tolerance: 1e-6,
        }
    }

    /// Fit the transform by finding optimal lambda for each feature
    ///
    /// Uses Brent's method to maximize log-likelihood.
    /// Data is in row-major format: `data[row * num_features + col]`
    pub fn fit(&mut self, data: &[f32], num_features: usize) -> Result<()> {
        if data.is_empty() {
            return Err(TreeBoostError::Data("Cannot fit on empty data".into()));
        }

        let num_rows = data.len() / num_features;
        if data.len() != num_rows * num_features {
            return Err(TreeBoostError::Data(format!(
                "Data length {} is not divisible by num_features {}",
                data.len(),
                num_features
            )));
        }

        self.lambdas = Vec::with_capacity(num_features);

        for col in 0..num_features {
            // Extract column values (skip NaN)
            let values: Vec<f32> = (0..num_rows)
                .map(|row| data[row * num_features + col])
                .filter(|v| !v.is_nan())
                .collect();

            if values.is_empty() {
                // All NaN - use lambda = 1 (identity-ish)
                self.lambdas.push(1.0);
                continue;
            }

            // Find optimal lambda using Brent's method
            let optimal_lambda = self.find_optimal_lambda(&values);
            self.lambdas.push(optimal_lambda);
        }

        self.fitted = true;
        Ok(())
    }

    /// Transform data in-place using fitted lambdas
    pub fn transform(&self, data: &mut [f32], num_features: usize) -> Result<()> {
        if !self.fitted {
            return Err(TreeBoostError::Config(
                "YeoJohnsonTransform not fitted. Call fit() first.".into(),
            ));
        }

        if self.lambdas.len() != num_features {
            return Err(TreeBoostError::Config(format!(
                "Feature count mismatch: fitted with {} features, got {}",
                self.lambdas.len(),
                num_features
            )));
        }

        let num_rows = data.len() / num_features;

        for row in 0..num_rows {
            for col in 0..num_features {
                let idx = row * num_features + col;
                if !data[idx].is_nan() {
                    data[idx] = yeo_johnson_transform(data[idx], self.lambdas[col]);
                }
            }
        }

        Ok(())
    }

    /// Inverse transform (convert back to original scale)
    pub fn inverse_transform(&self, data: &mut [f32], num_features: usize) -> Result<()> {
        if !self.fitted {
            return Err(TreeBoostError::Config(
                "YeoJohnsonTransform not fitted. Call fit() first.".into(),
            ));
        }

        if self.lambdas.len() != num_features {
            return Err(TreeBoostError::Config(format!(
                "Feature count mismatch: fitted with {} features, got {}",
                self.lambdas.len(),
                num_features
            )));
        }

        let num_rows = data.len() / num_features;

        for row in 0..num_rows {
            for col in 0..num_features {
                let idx = row * num_features + col;
                if !data[idx].is_nan() {
                    data[idx] = yeo_johnson_inverse(data[idx], self.lambdas[col]);
                }
            }
        }

        Ok(())
    }

    /// Fit and transform in one step
    pub fn fit_transform(&mut self, data: &mut [f32], num_features: usize) -> Result<()> {
        self.fit(data, num_features)?;
        self.transform(data, num_features)?;
        Ok(())
    }

    /// Check if the transform has been fitted
    pub fn is_fitted(&self) -> bool {
        self.fitted
    }

    /// Get the fitted lambdas
    pub fn lambdas(&self) -> &[f32] {
        &self.lambdas
    }

    /// Find optimal lambda for a single feature using golden section search
    fn find_optimal_lambda(&self, values: &[f32]) -> f32 {
        // Search bounds for lambda
        let mut a = -5.0f32;
        let mut b = 5.0f32;

        // Golden ratio
        let phi = (1.0 + 5.0f32.sqrt()) / 2.0;
        let resphi = 2.0 - phi;

        let mut x1 = a + resphi * (b - a);
        let mut x2 = b - resphi * (b - a);

        let mut f1 = -self.log_likelihood(values, x1);
        let mut f2 = -self.log_likelihood(values, x2);

        for _ in 0..self.max_iter {
            if (b - a).abs() < self.tolerance {
                break;
            }

            if f1 < f2 {
                b = x2;
                x2 = x1;
                f2 = f1;
                x1 = a + resphi * (b - a);
                f1 = -self.log_likelihood(values, x1);
            } else {
                a = x1;
                x1 = x2;
                f1 = f2;
                x2 = b - resphi * (b - a);
                f2 = -self.log_likelihood(values, x2);
            }
        }

        (a + b) / 2.0
    }

    /// Compute log-likelihood for a given lambda (higher is better)
    fn log_likelihood(&self, values: &[f32], lambda: f32) -> f32 {
        let n = values.len() as f32;
        if n == 0.0 {
            return f32::NEG_INFINITY;
        }

        // Transform values
        let transformed: Vec<f32> = values
            .iter()
            .map(|&x| yeo_johnson_transform(x, lambda))
            .collect();

        // Compute variance of transformed data
        let mean: f32 = transformed.iter().sum::<f32>() / n;
        let variance: f32 = transformed.iter().map(|&x| (x - mean).powi(2)).sum::<f32>() / n;

        if variance <= 0.0 || variance.is_nan() {
            return f32::NEG_INFINITY;
        }

        // Log-likelihood (simplified, ignoring constant terms)
        // LL = -n/2 * log(var) + (lambda - 1) * sum(sign(x) * log(|x| + 1))
        let jacobian_term: f32 = values
            .iter()
            .map(|&x| {
                let sign = if x >= 0.0 { 1.0 } else { -1.0 };
                (lambda - 1.0) * sign * (x.abs() + 1.0).ln()
            })
            .sum();

        -0.5 * n * variance.ln() + jacobian_term
    }
}

// =============================================================================
// Transform Functions
// =============================================================================

/// Apply Yeo-Johnson transform to a single value
///
/// - `x >= 0`: `((x + 1)^λ - 1) / λ`, or `ln(x + 1)` when `λ = 0`
/// - `x < 0`: `-((1 - x)^(2-λ) - 1) / (2 - λ)`, or `-ln(1 - x)` when `λ = 2`
///
/// The power term is evaluated as `expm1(p * log1p(u)) / p` in `f64`. Computing
/// `(u + 1)^p - 1` directly in `f32` cancels catastrophically when `p` is small
/// (the power rounds to 1.0), which made results near `λ = 0` and `λ = 2`
/// badly wrong.
///
/// NaN input yields NaN. Results too large for `f32` become infinite.
#[inline]
pub fn yeo_johnson_transform(x: f32, lambda: f32) -> f32 {
    // ((1 + u)^p - 1) / p for u >= 0, with its limit ln(1 + u) at p == 0.
    fn shifted_box_cox(u: f64, p: f64) -> f64 {
        let log1p = u.ln_1p();
        if p == 0.0 {
            log1p
        } else {
            (p * log1p).exp_m1() / p
        }
    }

    let (x, lambda) = (f64::from(x), f64::from(lambda));

    // The negative branch mirrors the positive one with exponent 2 - λ.
    let y = if x >= 0.0 {
        shifted_box_cox(x, lambda)
    } else {
        -shifted_box_cox(-x, 2.0 - lambda)
    };

    // Narrow back to the public f32 type; out-of-range magnitudes become ±inf.
    y as f32
}

/// Apply inverse Yeo-Johnson transform to a single value
///
/// - `y >= 0`: `(λ*y + 1)^(1/λ) - 1`, or `exp(y) - 1` when `λ = 0`
/// - `y < 0`: `1 - ((2-λ)*(-y) + 1)^(1/(2-λ))`, or `1 - exp(-y)` when `λ = 2`
///
/// The power term is evaluated as `expm1(log1p(p * v) / p)` in `f64`. Forming
/// `p * v + 1` in `f32` rounds away most of `p * v` when `p` is small, and the
/// following `1/p` power amplifies that error.
///
/// NaN input yields NaN. A `y` outside the range the forward transform can
/// produce for this `λ` (where `p * v + 1 < 0`) also yields NaN. Results too large
/// for `f32` become infinite.
#[inline]
pub fn yeo_johnson_inverse(y: f32, lambda: f32) -> f32 {
    // Inverse of ((1 + u)^p - 1) / p for v >= 0, with its limit exp(v) - 1 at p == 0.
    fn shifted_box_cox_inverse(v: f64, p: f64) -> f64 {
        if p == 0.0 {
            v.exp_m1()
        } else {
            ((p * v).ln_1p() / p).exp_m1()
        }
    }

    let (y, lambda) = (f64::from(y), f64::from(lambda));

    // The forward transform preserves sign, so the sign of y selects the branch.
    let x = if y >= 0.0 {
        shifted_box_cox_inverse(y, lambda)
    } else {
        -shifted_box_cox_inverse(-y, 2.0 - lambda)
    };

    // Narrow back to the public f32 type; out-of-range magnitudes become ±inf.
    x as f32
}

// =============================================================================
// Tests
// =============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `actual` is within `tol` of `expected`, reporting both on failure.
    fn assert_close(actual: f32, expected: f32, tol: f32) {
        assert!(
            (actual - expected).abs() < tol,
            "expected {}, got {} (tolerance {})",
            expected,
            actual,
            tol
        );
    }

    #[test]
    fn test_yeo_johnson_transform_positive() {
        // For positive values, should behave like Box-Cox with shift:
        // ((5 + 1)^0.5 - 1) / 0.5 = 2 * (sqrt(6) - 1)
        let y = yeo_johnson_transform(5.0, 0.5);

        assert!(y > 0.0);
        assert!(y.is_finite());
        assert_close(y, 2.898_979_5, 1e-4);
    }

    #[test]
    fn test_yeo_johnson_transform_negative() {
        // Negative values use exponent 2 - λ: -((5 + 1)^1.5 - 1) / 1.5
        let y = yeo_johnson_transform(-5.0, 0.5);

        assert!(y < 0.0);
        assert!(y.is_finite());
        assert_close(y, -9.131_292, 1e-3);
    }

    #[test]
    fn test_yeo_johnson_transform_zero_maps_to_zero() {
        // x = 0 is a fixed point for every lambda
        for &lambda in &[-2.0f32, 0.0, 0.5, 1.0, 2.0, 3.0] {
            assert_eq!(yeo_johnson_transform(0.0, lambda), 0.0);
        }
    }

    #[test]
    fn test_yeo_johnson_transform_zero_lambda() {
        // Lambda = 0 should be log transform for positive
        let x = 5.0f32;
        let y = yeo_johnson_transform(x, 0.0);

        assert_close(y, (x + 1.0).ln(), 1e-5);
    }

    #[test]
    fn test_yeo_johnson_transform_lambda_two() {
        // Lambda = 2 should be -log for negative
        let x = -5.0f32;
        let y = yeo_johnson_transform(x, 2.0);

        assert_close(y, -((-x) + 1.0).ln(), 1e-5);
    }

    #[test]
    fn test_yeo_johnson_inverse_positive() {
        let x = 5.0;
        let lambda = 0.5;

        let y = yeo_johnson_transform(x, lambda);
        let x_recovered = yeo_johnson_inverse(y, lambda);

        assert_close(x_recovered, x, 1e-4);
    }

    #[test]
    fn test_yeo_johnson_inverse_negative() {
        let x = -5.0;
        let lambda = 0.5;

        let y = yeo_johnson_transform(x, lambda);
        let x_recovered = yeo_johnson_inverse(y, lambda);

        assert_close(x_recovered, x, 1e-4);
    }

    #[test]
    fn test_yeo_johnson_fit() {
        let mut transform = YeoJohnsonTransform::new();

        // Skewed data - should find lambda that normalizes it
        let data = vec![0.1, 10.0, 1.0, 100.0, 5.0, 1000.0]; // 3 rows × 2 features

        transform.fit(&data, 2).unwrap();

        assert!(transform.is_fitted());
        assert_eq!(transform.lambdas().len(), 2);

        // Lambdas must be finite and inside the search interval
        for &lambda in transform.lambdas() {
            assert!(lambda.is_finite());
            assert!(lambda > -5.0 && lambda < 5.0);
        }
    }

    #[test]
    fn test_yeo_johnson_fit_empty_data_error() {
        let mut transform = YeoJohnsonTransform::new();

        assert!(transform.fit(&[], 2).is_err());
        assert!(!transform.is_fitted());
    }

    #[test]
    fn test_yeo_johnson_fit_transform() {
        let mut transform = YeoJohnsonTransform::new();

        let mut data = vec![0.1, 1.0, 10.0, 100.0]; // 2 rows × 2 features

        transform.fit_transform(&mut data, 2).unwrap();

        assert!(transform.is_fitted());
        assert_eq!(transform.lambdas().len(), 2);

        // Values should be finite
        for &v in &data {
            assert!(v.is_finite());
        }
    }

    #[test]
    fn test_yeo_johnson_inverse_transform() {
        let mut transform = YeoJohnsonTransform::new();

        let original = vec![0.5, 2.0, 5.0, 10.0]; // 2 rows × 2 features
        let mut data = original.clone();

        transform.fit_transform(&mut data, 2).unwrap();
        transform.inverse_transform(&mut data, 2).unwrap();

        // Should recover original (within tolerance)
        for (orig, recovered) in original.iter().zip(data.iter()) {
            assert!(
                (orig - recovered).abs() < 0.01,
                "orig={}, recovered={}",
                orig,
                recovered
            );
        }
    }

    #[test]
    fn test_yeo_johnson_with_lambdas() {
        // Use fixed lambdas
        let transform = YeoJohnsonTransform::with_lambdas(vec![0.5, 1.0]);

        assert!(transform.is_fitted());
        assert_eq!(transform.lambdas(), &[0.5, 1.0]);
    }

    #[test]
    fn test_yeo_johnson_with_lambdas_round_trip_mixed_signs() {
        // Fixed lambdas make the expected behavior independent of the optimizer
        let transform = YeoJohnsonTransform::with_lambdas(vec![0.5, 1.5]);

        let original = vec![-3.0, 4.0, 0.5, -0.25]; // 2 rows × 2 features
        let mut data = original.clone();

        transform.transform(&mut data, 2).unwrap();

        // The transform preserves the sign of every value
        for (orig, transformed) in original.iter().zip(data.iter()) {
            assert_eq!(*orig < 0.0, *transformed < 0.0);
        }

        transform.inverse_transform(&mut data, 2).unwrap();

        for (orig, recovered) in original.iter().zip(data.iter()) {
            assert_close(*recovered, *orig, 1e-3);
        }
    }

    #[test]
    fn test_yeo_johnson_not_fitted_error() {
        let transform = YeoJohnsonTransform::new();
        let mut data = vec![1.0, 2.0];

        let result = transform.transform(&mut data, 2);
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("not fitted"));

        let result = transform.inverse_transform(&mut data, 2);
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("not fitted"));

        // A failed call must not touch the data
        assert_eq!(data, vec![1.0, 2.0]);
    }

    #[test]
    fn test_yeo_johnson_feature_count_mismatch_error() {
        let transform = YeoJohnsonTransform::with_lambdas(vec![0.5, 1.0]);
        let mut data = vec![1.0, 2.0, 3.0];

        assert!(transform.transform(&mut data, 3).is_err());
        assert!(transform.inverse_transform(&mut data, 3).is_err());

        // A failed call must not touch the data
        assert_eq!(data, vec![1.0, 2.0, 3.0]);
    }

    #[test]
    fn test_yeo_johnson_with_nan() {
        let mut transform = YeoJohnsonTransform::new();

        // Data with NaN
        let mut data = vec![1.0, f32::NAN, 5.0, 10.0];

        transform.fit_transform(&mut data, 2).unwrap();

        // NaN should remain NaN
        assert!(!data[0].is_nan()); // 1.0 transformed
        assert!(data[1].is_nan()); // Still NaN
        assert!(!data[2].is_nan()); // 5.0 transformed
        assert!(!data[3].is_nan()); // 10.0 transformed
    }

    #[test]
    fn test_yeo_johnson_serialization() {
        let mut transform = YeoJohnsonTransform::new();
        transform.fit(&[1.0, 2.0, 3.0, 4.0], 2).unwrap();

        let json = serde_json::to_string(&transform).unwrap();
        let loaded: YeoJohnsonTransform = serde_json::from_str(&json).unwrap();

        assert!(loaded.is_fitted());
        assert_eq!(loaded.lambdas(), transform.lambdas());
    }

    #[test]
    fn test_yeo_johnson_identity_lambda_one() {
        // For lambda=1 both branches reduce to the identity:
        // x >= 0: (x+1)^1 - 1 = x;  x < 0: -((1-x)^1 - 1) = x
        assert_close(yeo_johnson_transform(5.0, 1.0), 5.0, 1e-5);
        assert_close(yeo_johnson_transform(-5.0, 1.0), -5.0, 1e-5);
    }

    #[test]
    fn test_yeo_johnson_all_nan_column() {
        let mut transform = YeoJohnsonTransform::new();

        // Column 0 all NaN
        let data = vec![f32::NAN, 1.0, f32::NAN, 2.0];

        transform.fit(&data, 2).unwrap();

        // Should handle gracefully (lambda = 1.0 for NaN column)
        assert!(transform.is_fitted());
        assert_eq!(transform.lambdas().len(), 2);
        assert_eq!(transform.lambdas()[0], 1.0);
    }
}