use treeboost::preprocessing::{
FrequencyEncoder, ImputeStrategy, LabelEncoder, MinMaxScaler, OneHotEncoder, OutlierAction,
OutlierDetector, OutlierMethod, RobustScaler, Scaler, SimpleImputer, StandardScaler,
TransformResult, UnknownStrategy,
};
#[test]
fn test_preprocessing_standard_scaler_workflow() {
    // StandardScaler should center each feature column to mean ~0 and scale it
    // to std ~1, regardless of the feature's original magnitude.
    let num_rows = 100;
    let num_features = 3;
    // Row-major data: three features on very different scales.
    let data: Vec<f32> = (0..num_rows)
        .flat_map(|i| {
            let x = i as f32;
            [x * 10.0, x.powf(2.0), 1000.0 + x * 0.1]
        })
        .collect();
    let mut scaler = StandardScaler::new();
    scaler.fit(&data, num_features).expect("Fit should succeed");
    assert!(scaler.is_fitted());
    let mut transformed = data.clone();
    scaler
        .transform(&mut transformed, num_features)
        .expect("Transform should succeed");
    // Verify every feature column is standardized.
    for f in 0..num_features {
        let column: Vec<f32> = transformed
            .iter()
            .skip(f)
            .step_by(num_features)
            .copied()
            .collect();
        let mean = column.iter().sum::<f32>() / num_rows as f32;
        let var = column.iter().map(|v| (v - mean).powi(2)).sum::<f32>() / num_rows as f32;
        let std = var.sqrt();
        assert!(mean.abs() < 0.01, "Feature {} mean should be ~0, got {}", f, mean);
        assert!((std - 1.0).abs() < 0.1, "Feature {} std should be ~1, got {}", f, std);
    }
}
#[test]
fn test_preprocessing_minmax_scaler() {
    // MinMaxScaler should map every feature into [0, 1], hitting both extremes.
    let num_rows = 50;
    let num_features = 2;
    // Feature 0 increases while feature 1 decreases — both must land in [0, 1].
    let data: Vec<f32> = (0..num_rows)
        .flat_map(|i| [i as f32, 100.0 - i as f32])
        .collect();
    let mut scaler = MinMaxScaler::new();
    scaler.fit(&data, num_features).expect("Fit should succeed");
    let mut transformed = data.clone();
    scaler
        .transform(&mut transformed, num_features)
        .expect("Transform should succeed");
    // Every scaled value lies in the unit interval.
    for &v in &transformed {
        assert!((0.0..=1.0).contains(&v), "Value {} not in [0, 1]", v);
    }
    // Each feature column reaches exactly 0 at its minimum and 1 at its maximum.
    for f in 0..num_features {
        let column: Vec<f32> = transformed
            .iter()
            .skip(f)
            .step_by(num_features)
            .copied()
            .collect();
        let min = column.iter().copied().fold(f32::INFINITY, f32::min);
        let max = column.iter().copied().fold(f32::NEG_INFINITY, f32::max);
        assert!(min.abs() < 0.001, "Feature {} min should be 0, got {}", f, min);
        assert!((max - 1.0).abs() < 0.001, "Feature {} max should be 1, got {}", f, max);
    }
}
#[test]
fn test_preprocessing_robust_scaler_outliers() {
    // RobustScaler is based on median/IQR, so a handful of extreme values must
    // not inflate the scale applied to the well-behaved points.
    let num_rows = 100;
    let num_features = 1;
    // 95 small cyclic values followed by 5 huge outliers.
    let data: Vec<f32> = (0..num_rows)
        .map(|i| if i < 95 { i as f32 % 10.0 } else { 10000.0 })
        .collect();
    let mut robust = RobustScaler::new();
    robust.fit(&data, num_features).expect("Fit should succeed");
    let mut transformed = data.clone();
    robust
        .transform(&mut transformed, num_features)
        .expect("Transform should succeed");
    // The first 95 rows are the inliers; they should stay in a small range.
    let inliers = &transformed[0..95];
    let max_inlier = inliers.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    let min_inlier = inliers.iter().copied().fold(f32::INFINITY, f32::min);
    assert!(
        max_inlier < 5.0,
        "Non-outliers should be scaled reasonably: {}",
        max_inlier
    );
    assert!(
        min_inlier > -5.0,
        "Non-outliers should be scaled reasonably: {}",
        min_inlier
    );
}
#[test]
fn test_preprocessing_frequency_encoder_workflow() {
    // With normalization enabled, each category maps to its relative frequency
    // in the fit data: apple 3/6, banana 2/6, cherry 1/6.
    let categories = vec!["apple", "banana", "apple", "cherry", "apple", "banana"];
    let mut encoder = FrequencyEncoder::new().with_normalize(true);
    let _ = encoder.fit(&categories);
    assert!(encoder.is_fitted());
    assert_eq!(encoder.num_categories(), 3);
    let transformed = encoder
        .transform(&["apple", "banana", "cherry"])
        .expect("Transform should succeed");
    let expected = [0.5, 0.333, 0.167];
    for (got, want) in transformed.iter().zip(expected.iter()) {
        assert!((got - want).abs() < 0.01);
    }
    // A category never seen during fit falls back to the default encoding.
    let unknown = encoder.transform_single("mango");
    assert_eq!(unknown, Some(0.0), "Unknown category should return default");
}
#[test]
fn test_preprocessing_label_encoder_roundtrip() {
    // Labels are assigned in sorted category order and must round-trip exactly.
    let categories = vec!["red", "green", "blue", "red", "green"];
    let mut encoder = LabelEncoder::new();
    let _ = encoder.fit(&categories);
    let labels = encoder
        .transform(&categories)
        .expect("Transform should succeed");
    let reversed = encoder
        .inverse_transform(&labels)
        .expect("Inverse should succeed");
    assert_eq!(reversed, categories, "Roundtrip should preserve categories");
    // Alphabetical assignment: blue < green < red.
    for (name, label) in [("blue", 0), ("green", 1), ("red", 2)] {
        assert_eq!(encoder.get_label(name), Some(label));
    }
}
#[test]
fn test_preprocessing_onehot_encoder_drop_first() {
    // drop_first removes the first (sorted) category's column, leaving one
    // column each for "B" and "C"; "A" becomes the all-zeros baseline.
    let categories = vec!["A", "B", "C"];
    let mut encoder = OneHotEncoder::new()
        .with_drop_first(true)
        .with_unknown_strategy(UnknownStrategy::AllZeros);
    let _ = encoder.fit(&categories);
    assert_eq!(encoder.num_columns(), 2);
    let encoded = encoder
        .transform(&["A", "B", "C", "unknown"])
        .expect("Transform should succeed");
    // Row layout (columns: B, C):
    //   "A"       -> [0, 0]  (dropped baseline)
    //   "B"       -> [1, 0]
    //   "C"       -> [0, 1]
    //   "unknown" -> [0, 0]  (AllZeros strategy)
    let expected = vec![0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
    assert_eq!(encoded, expected);
}
#[test]
fn test_preprocessing_simple_imputer_strategies() {
    // Mean imputation: NaNs are replaced with the per-feature mean computed
    // from the valid (non-NaN) values.
    let num_features = 2;
    // 10 rows x 2 features; row 1 is missing f1 and row 2 is missing f0.
    let data = vec![
        1.0, 10.0,
        2.0, f32::NAN,
        f32::NAN, 30.0,
        4.0, 40.0,
        5.0, 50.0,
        6.0, 60.0,
        7.0, 70.0,
        8.0, 80.0,
        9.0, 90.0,
        10.0, 100.0,
    ];
    let mut imputer = SimpleImputer::new(ImputeStrategy::Mean);
    imputer
        .fit(&data, num_features)
        .expect("Fit should succeed");
    let mut imputed = data.clone();
    imputer
        .transform(&mut imputed, num_features)
        .expect("Transform should succeed");
    assert!(imputed.iter().all(|v| !v.is_nan()), "NaN should be imputed");
    // mean(f0 without NaN) = 52/9 ~= 5.78; mean(f1 without NaN) = 530/9 ~= 58.89.
    assert!(
        (imputed[4] - 5.78).abs() < 0.1,
        "Row 2, f0 should be imputed to mean"
    );
    assert!(
        (imputed[3] - 58.89).abs() < 0.1,
        "Row 1, f1 should be imputed to mean"
    );
}
use treeboost::preprocessing::{
EwmaGenerator, LagGenerator, NaNStrategy, RollingGenerator, RollingStat, SeasonalComponent,
SeasonalGenerator,
};
#[test]
fn test_timeseries_lag_generator_workflow() {
    // LagGenerator output layout per row (2 input features, lags [1, 7]):
    //   [f0, f1, f0_lag1, f1_lag1, f0_lag7, f1_lag7] -> 6 columns.
    let num_rows = 30;
    let num_features = 2;
    // Feature 0: noisy upward drift; feature 1: steep linear ramp.
    let prices: Vec<f32> = (0..num_rows)
        .flat_map(|i| {
            let t = i as f32;
            [100.0 + t * 0.5 + (t * 0.3).sin() * 5.0, 1000.0 + t * 100.0]
        })
        .collect();
    let gen = LagGenerator::new(vec![1, 7]);
    let lagged = gen
        .transform(&prices, num_features)
        .expect("Transform should succeed");
    assert_eq!(lagged.len(), 30 * 6);
    // Spot-check row 10: f0's original value plus its 1-step and 7-step lags.
    let row10_start = 10 * 6;
    assert_eq!(lagged[row10_start], prices[10 * 2]);
    assert_eq!(lagged[row10_start + 2], prices[9 * 2]);
    assert_eq!(lagged[row10_start + 4], prices[3 * 2]);
    // The first 7 rows have no 7-step history, so the lag7 column is NaN.
    for row in 0..7 {
        let lag7_idx = row * 6 + 4;
        assert!(lagged[lag7_idx].is_nan(), "Row {} lag7 should be NaN", row);
    }
}
#[test]
fn test_timeseries_lag_generator_nan_strategies() {
    // Output layout per row: [original, lag2]. The first two rows have no
    // 2-step history, so their lag column depends on the chosen NaN strategy.
    let data = vec![10.0, 20.0, 30.0, 40.0, 50.0];

    // Default strategy: the leading gaps stay NaN.
    let gen_keep = LagGenerator::new(vec![2]);
    let result_keep = gen_keep.transform(&data, 1).unwrap();
    assert!(result_keep[1].is_nan());
    assert!(result_keep[3].is_nan());

    // ForwardFill: the gaps are filled with the first observed value.
    let gen_ff = LagGenerator::new(vec![2]).with_nan_strategy(NaNStrategy::ForwardFill);
    let result_ff = gen_ff.transform(&data, 1).unwrap();
    assert_eq!(result_ff[1], 10.0);
    assert_eq!(result_ff[3], 10.0);

    // Constant: the gaps are filled with the supplied constant.
    let gen_const = LagGenerator::new(vec![2]).with_nan_strategy(NaNStrategy::constant(0.0));
    let result_const = gen_const.transform(&data, 1).unwrap();
    assert_eq!(result_const[1], 0.0);
}
#[test]
fn test_timeseries_rolling_generator_workflow() {
    // Window of 5 with stats [Mean, Std, Min, Max]; output layout per row:
    //   [original, mean, std, min, max] -> 5 columns.
    let num_rows = 100;
    let data: Vec<f32> = (0..num_rows).map(|i| 100.0 + (i as f32) * 2.0).collect();
    let gen = RollingGenerator::new(5)
        .with_stats(vec![
            RollingStat::Mean,
            RollingStat::Std,
            RollingStat::Min,
            RollingStat::Max,
        ])
        .with_min_periods(3);
    let rolled = gen.transform(&data, 1).expect("Transform should succeed");
    assert_eq!(rolled.len(), 100 * 5);

    // Row 50's window covers data[46..=50] = 192, 194, 196, 198, 200.
    let row50_start = 50 * 5;
    let expected_mean = (192.0 + 194.0 + 196.0 + 198.0 + 200.0) / 5.0;
    let expected_min = 192.0;
    let expected_max = 200.0;
    assert_eq!(rolled[row50_start], 200.0);
    assert!((rolled[row50_start + 1] - expected_mean).abs() < 0.01);
    assert!((rolled[row50_start + 3] - expected_min).abs() < 0.01);
    assert!((rolled[row50_start + 4] - expected_max).abs() < 0.01);

    // min_periods = 3, so rows 0 and 1 cannot produce a rolling mean yet.
    assert!(rolled[1].is_nan());
    assert!(rolled[6].is_nan());
}
#[test]
fn test_timeseries_ewma_workflow() {
    // Exponential smoothing (alpha = 0.3) should damp the deterministic noise
    // component, reducing variance while keeping every output finite.
    let data: Vec<f32> = (0..50)
        .map(|i| 10.0 * i as f32 + ((i * 7) % 13) as f32)
        .collect();
    let gen = EwmaGenerator::new(0.3);
    let smoothed = gen.transform(&data, 1).expect("Transform should succeed");
    assert_eq!(smoothed.len(), 50);
    let orig_var = variance(&data);
    let smooth_var = variance(&smoothed);
    assert!(
        smooth_var < orig_var,
        "EWMA should reduce variance: orig={}, smooth={}",
        orig_var,
        smooth_var
    );
    assert!(
        smoothed.iter().all(|v| v.is_finite()),
        "EWMA value should be finite"
    );
}
#[test]
fn test_timeseries_seasonal_generator_workflow() {
    // One week of hourly timestamps starting Mon 2024-01-15 00:00:00 UTC.
    // Output layout per row: [hour, day_of_week, is_weekend] -> 3 columns.
    let base_ts = 1705276800.0; // Monday, midnight UTC
    let timestamps: Vec<f64> = (0..168).map(|h| base_ts + (h * 3600) as f64).collect();
    let gen = SeasonalGenerator::new(vec![
        SeasonalComponent::Hour,
        SeasonalComponent::DayOfWeek,
        SeasonalComponent::IsWeekend,
    ]);
    // FIX: the argument was mangled to `×tamps` (HTML-entity decoding of
    // `&times;` inside `&timestamps`); restored to the intended borrow.
    let features = gen.transform_timestamps(&timestamps);
    assert_eq!(features.len(), 168 * 3);
    // Hour-of-day runs 0..23 and wraps back to 0 at the next midnight.
    assert_eq!(features[0], 0.0);
    assert_eq!(features[12 * 3], 12.0);
    assert_eq!(features[23 * 3], 23.0);
    assert_eq!(features[24 * 3], 0.0);
    // Day-of-week starts at 0 (Monday) and increments daily.
    assert_eq!(features[1], 0.0);
    assert_eq!(features[24 * 3 + 1], 1.0);
    assert_eq!(features[5 * 24 * 3 + 1], 5.0);
    // IsWeekend is set only on Saturday (day 5) and Sunday (day 6).
    assert_eq!(features[2], 0.0);
    assert_eq!(features[5 * 24 * 3 + 2], 1.0);
    assert_eq!(features[6 * 24 * 3 + 2], 1.0);
}
#[test]
fn test_timeseries_seasonal_cyclical_encoding() {
    // With cyclical encoding the Hour component becomes a (sin, cos) pair,
    // so one component yields two output features.
    let gen = SeasonalGenerator::new(vec![SeasonalComponent::Hour]).with_cyclical(true);
    assert_eq!(gen.num_features(), 2);
    // Hours 0, 6, 12, 18 UTC — the four quarter points of the 24-hour cycle.
    let base_ts = 1705276800.0; // Monday, midnight UTC
    let timestamps = vec![
        base_ts,
        base_ts + 21600.0,
        base_ts + 43200.0,
        base_ts + 64800.0,
    ];
    // FIX: the argument was mangled to `×tamps` (HTML-entity decoding of
    // `&times;` inside `&timestamps`); restored to the intended borrow.
    let features = gen.transform_timestamps(&timestamps);
    assert_eq!(features.len(), 8);
    // Row layout: [sin(2*pi*h/24), cos(2*pi*h/24)].
    assert!(features[0].abs() < 0.01); // sin(0)       =  0
    assert!((features[1] - 1.0).abs() < 0.01); // cos(0)       =  1
    assert!((features[2] - 1.0).abs() < 0.01); // sin(pi/2)    =  1
    assert!(features[3].abs() < 0.01); // cos(pi/2)    =  0
    assert!(features[4].abs() < 0.01); // sin(pi)      =  0
    assert!((features[5] + 1.0).abs() < 0.01); // cos(pi)      = -1
    assert!((features[6] + 1.0).abs() < 0.01); // sin(3*pi/2)  = -1
    assert!(features[7].abs() < 0.01); // cos(3*pi/2)  =  0
}
#[test]
fn test_timeseries_combined_feature_engineering() {
    // Lag and rolling features built from the same synthetic series
    // (trend + weekly seasonality + deterministic noise) should both be fully
    // populated once 7 rows of history have accumulated.
    let num_rows = 60;
    let data: Vec<f32> = (0..num_rows)
        .map(|i| {
            let trend = 100.0 + (i as f32) * 0.5;
            let seasonal = 10.0 * ((i % 7) as f32 / 3.0).sin();
            let noise = ((i * 17) % 11) as f32 - 5.0;
            trend + seasonal + noise
        })
        .collect();

    // Both generators produce 3 columns per row:
    // lags -> [orig, lag1, lag7]; rolling -> [orig, mean, std].
    let lag_gen = LagGenerator::new(vec![1, 7]);
    let lagged = lag_gen
        .transform(&data, 1)
        .expect("Lag transform should succeed");
    let roll_gen = RollingGenerator::new(7)
        .with_stats(vec![RollingStat::Mean, RollingStat::Std])
        .with_min_periods(3);
    let rolled = roll_gen
        .transform(&data, 1)
        .expect("Roll transform should succeed");
    assert_eq!(lagged.len() / 3, num_rows);
    assert_eq!(rolled.len() / 3, num_rows);

    // From row 7 onward every derived column must be a real number.
    for row in 7..num_rows {
        let start = row * 3;
        assert!(
            !lagged[start + 1].is_nan(),
            "Lag1 at row {} should be valid",
            row
        );
        assert!(
            !lagged[start + 2].is_nan(),
            "Lag7 at row {} should be valid",
            row
        );
        assert!(
            !rolled[start + 1].is_nan(),
            "Rolling mean at row {} should be valid",
            row
        );
        assert!(
            !rolled[start + 2].is_nan(),
            "Rolling std at row {} should be valid",
            row
        );
    }
}
/// Population variance of the finite values in `data`.
///
/// Non-finite entries (NaN, +/-inf) are ignored; returns 0.0 when no finite
/// values are present.
fn variance(data: &[f32]) -> f32 {
    let finite: Vec<f32> = data.iter().copied().filter(|x| x.is_finite()).collect();
    if finite.is_empty() {
        return 0.0;
    }
    let n = finite.len() as f32;
    let mean = finite.iter().sum::<f32>() / n;
    let sum_sq: f32 = finite.iter().map(|x| (x - mean).powi(2)).sum();
    sum_sq / n
}
#[test]
fn test_outlier_iqr_cap_workflow() {
    // IQR detector with Cap action: extreme values are clamped to the fitted
    // per-feature bounds rather than removed.
    let mut data: Vec<f32> = Vec::with_capacity(102 * 2);
    for i in 0..100 {
        data.push(i as f32);
        data.push((i as f32 * 2.0) + 10.0);
    }
    // Two extra rows carrying extreme values in both features.
    data.extend([1000.0, 100.0, -500.0, 5000.0]);
    let mut detector = OutlierDetector::new(OutlierMethod::iqr()).with_action(OutlierAction::Cap);
    detector.fit(&data, 2).expect("Fit should succeed");
    let names = vec!["f0".into(), "f1".into()];
    let result = detector
        .transform(&mut data, 2, &names)
        .expect("Transform should succeed");
    match result {
        TransformResult::Capped { outlier_count } => {
            assert!(outlier_count >= 2, "Should have capped at least 2 outliers");
        }
        _ => panic!("Expected Capped result"),
    }
    // After capping, every value must sit inside its feature's fitted bounds.
    for feat in 0..2 {
        let bounds = &detector.bounds()[feat];
        for row in 0..102 {
            let val = data[row * 2 + feat];
            assert!(
                val >= bounds.lower && val <= bounds.upper,
                "Value {} should be within bounds [{}, {}]",
                val,
                bounds.lower,
                bounds.upper
            );
        }
    }
}
#[test]
fn test_outlier_zscore_flag_workflow() {
    // Z-score detector with Flag action: data is left in place and a 0/1
    // indicator column is produced for every input feature.
    let num_rows = 50;
    let mut data: Vec<f32> = Vec::with_capacity(num_rows * 2);
    for i in 0..num_rows - 2 {
        data.push(50.0 + (i as f32 % 10.0) - 5.0);
        data.push(100.0 + (i as f32 % 5.0));
    }
    // Two extra rows, each with one extreme value.
    data.extend([200.0, 102.0, 47.0, 500.0]);
    let mut detector =
        OutlierDetector::new(OutlierMethod::zscore()).with_action(OutlierAction::Flag);
    detector.fit(&data, 2).expect("Fit should succeed");
    let names = vec!["feature_0".into(), "feature_1".into()];
    let result = detector
        .transform(&mut data, 2, &names)
        .expect("Transform should succeed");
    match result {
        TransformResult::Flagged {
            indicators,
            indicator_names,
        } => {
            // Indicator columns are named after the source features.
            assert_eq!(indicator_names.len(), 2);
            assert_eq!(indicator_names[0], "feature_0_outlier");
            assert_eq!(indicator_names[1], "feature_1_outlier");
            let flagged = indicators.iter().filter(|&&v| v > 0.0).count();
            assert!(
                flagged >= 2,
                "Should flag at least 2 outliers, got {}",
                flagged
            );
        }
        _ => panic!("Expected Flagged result"),
    }
}
#[test]
fn test_outlier_remove_workflow() {
    // IQR detector with Remove action: rows containing outliers are dropped
    // and the result reports which row indices survived.
    let mut data: Vec<f32> = Vec::new();
    for i in 0..10 {
        data.push(i as f32 + 1.0);
        data.push((i as f32 + 1.0) * 10.0);
    }
    // Two extra rows with extreme values that should be removed.
    data.extend([1000.0, 50.0, 5.0, 10000.0]);
    let mut detector =
        OutlierDetector::new(OutlierMethod::iqr()).with_action(OutlierAction::Remove);
    detector.fit(&data, 2).expect("Fit should succeed");
    let names = vec!["f0".into(), "f1".into()];
    let result = detector
        .transform(&mut data, 2, &names)
        .expect("Transform should succeed");
    match result {
        TransformResult::Removed {
            cleaned_data,
            kept_indices,
            removed_count,
        } => {
            assert!(removed_count >= 1, "Should remove at least 1 row");
            assert!(kept_indices.len() < 12, "Should have fewer than 12 rows");
            assert_eq!(
                cleaned_data.len(),
                kept_indices.len() * 2,
                "Cleaned data should match kept indices"
            );
            // No surviving value may still be classified as an outlier.
            for row in 0..kept_indices.len() {
                for feat in 0..2 {
                    let val = cleaned_data[row * 2 + feat];
                    assert!(
                        !detector.is_outlier(val, feat),
                        "Cleaned data should have no outliers"
                    );
                }
            }
        }
        _ => panic!("Expected Removed result"),
    }
}
#[test]
fn test_outlier_then_scale_pipeline() {
    // Pipeline: cap outliers first, then standardize. Capping prevents the
    // single extreme value from dominating the scaler's mean and std.
    let mut data: Vec<f32> = (0..50).map(|i| i as f32 * 2.0).collect();
    data.push(10000.0);
    let mut detector = OutlierDetector::new(OutlierMethod::iqr()).with_action(OutlierAction::Cap);
    detector.fit(&data, 1).expect("Fit should succeed");
    detector
        .transform(&mut data, 1, &["f0".into()])
        .expect("Transform should succeed");
    let mut scaler = StandardScaler::new();
    scaler.fit(&data, 1).expect("Fit should succeed");
    scaler
        .transform(&mut data, 1)
        .expect("Transform should succeed");
    // Because the outlier was capped, no scaled value is far from zero.
    for val in &data {
        assert!(
            val.abs() < 10.0,
            "Scaled values should be reasonable, got {}",
            val
        );
    }
    let mean = data.iter().sum::<f32>() / data.len() as f32;
    assert!(mean.abs() < 0.1, "Mean should be ~0, got {}", mean);
}
#[test]
fn test_outlier_multifeature() {
    // Per-feature outlier counting: only the features that received extreme
    // values in the final row should report outliers.
    let num_rows = 20;
    let num_features = 4;
    let mut data: Vec<f32> = Vec::with_capacity(num_rows * num_features);
    for i in 0..num_rows - 1 {
        data.push(i as f32);
        data.push(i as f32 * 2.0);
        data.push(100.0 - i as f32);
        data.push((i % 5) as f32);
    }
    // Final row: extreme values in f0 and f1, unremarkable values in f2 and f3.
    data.extend([1000.0, 5000.0, 50.0, 2.0]);
    let mut detector = OutlierDetector::new(OutlierMethod::iqr());
    detector
        .fit(&data, num_features)
        .expect("Fit should succeed");
    let counts = detector
        .outlier_counts(&data, num_features)
        .expect("Count should succeed");
    assert_eq!(counts.len(), num_features);
    assert!(counts[0] >= 1, "f0 should have outliers");
    assert!(counts[1] >= 1, "f1 should have outliers");
}