/// Preprocessor that handles regression targets one observation at a time.
///
/// Implementors must be `Send + Sync`.
pub trait StreamingTargetPreprocessor: Send + Sync {
/// Consumes one raw `target`, letting the implementor update whatever state
/// it keeps, and returns the value to use in place of `target`.
fn fit_transform(&mut self, target: f64) -> f64;
/// Converts a value from the transformed space back to the target space
/// without mutating the preprocessor. How exact the inversion is depends on
/// the implementor.
fn inverse_transform(&self, transformed: f64) -> f64;
/// Discards state accumulated through `fit_transform`; what exactly is
/// cleared is up to the implementor.
fn reset(&mut self);
}
/// Online standardizer for a stream of regression targets.
///
/// Holds a running count, mean and sum of squared deviations (`m2`) and
/// reports the sample variance clamped from below by `variance_floor`, so the
/// standard deviation used for scaling is never zero.
#[derive(Clone, Debug)]
pub struct TargetScaler {
    /// Number of observations folded into the statistics so far.
    count: u64,
    /// Running mean of the observations.
    mean: f64,
    /// Running sum of squared deviations from the mean.
    m2: f64,
    /// Lower bound applied to the reported variance.
    variance_floor: f64,
}

impl TargetScaler {
    /// Variance floor used by [`TargetScaler::new`].
    const DEFAULT_VARIANCE_FLOOR: f64 = 1e-10;

    /// Creates an empty scaler with the default variance floor (`1e-10`).
    pub fn new() -> Self {
        Self::with_variance_floor(Self::DEFAULT_VARIANCE_FLOOR)
    }

    /// Creates an empty scaler whose reported variance never drops below
    /// `variance_floor`.
    ///
    /// # Panics
    ///
    /// Panics if `variance_floor` is not strictly positive (this includes
    /// NaN) or is not finite.
    pub fn with_variance_floor(variance_floor: f64) -> Self {
        assert!(
            variance_floor > 0.0,
            "TargetScaler: variance_floor must be > 0.0, got {}",
            variance_floor
        );
        // An infinite floor would make `std()` infinite, collapsing every
        // transformed value to 0 (or NaN) and every inverse to NaN.
        assert!(
            variance_floor.is_finite(),
            "TargetScaler: variance_floor must be finite, got {}",
            variance_floor
        );
        Self {
            count: 0,
            mean: 0.0,
            m2: 0.0,
            variance_floor,
        }
    }

    /// Current running mean (`0.0` before any observation).
    pub fn mean(&self) -> f64 {
        self.mean
    }

    /// Square root of [`TargetScaler::variance`]; always strictly positive.
    pub fn std(&self) -> f64 {
        self.variance().sqrt()
    }

    /// Sample variance `m2 / (count - 1)`, clamped to at least the variance
    /// floor. With fewer than two observations the floor itself is returned.
    pub fn variance(&self) -> f64 {
        if self.count < 2 {
            self.variance_floor
        } else {
            (self.m2 / (self.count - 1) as f64).max(self.variance_floor)
        }
    }
}

impl Default for TargetScaler {
    /// Equivalent to [`TargetScaler::new`].
    fn default() -> Self {
        Self::new()
    }
}
impl StreamingTargetPreprocessor for TargetScaler {
    /// Folds `target` into the running statistics (Welford's online update)
    /// and returns it standardized with the *updated* mean and std.
    ///
    /// A non-finite `target` (NaN or ±infinity) is returned unchanged and is
    /// not folded in: a single such value would otherwise turn the running
    /// mean and `m2` into NaN for every later observation until `reset`.
    fn fit_transform(&mut self, target: f64) -> f64 {
        if !target.is_finite() {
            return target;
        }
        self.count += 1;
        let delta = target - self.mean;
        self.mean += delta / self.count as f64;
        // Deviation from the mean *after* the update; Welford's recurrence
        // needs both the pre- and post-update deviations.
        let residual = target - self.mean;
        self.m2 += delta * residual;
        residual / self.std()
    }

    /// Undoes the scaling using the current statistics:
    /// `transformed * std + mean`.
    fn inverse_transform(&self, transformed: f64) -> f64 {
        transformed * self.std() + self.mean
    }

    /// Zeroes the count, mean and `m2`; the variance floor is kept.
    fn reset(&mut self) {
        self.count = 0;
        self.mean = 0.0;
        self.m2 = 0.0;
    }
}
/// Field-less marker type for the `ln(1 + y)` target transform; it carries
/// no state of its own.
#[derive(Clone, Debug, Default)]
pub struct TargetLog1pTransform;

impl TargetLog1pTransform {
    /// Builds the (unit) transform value.
    pub fn new() -> Self {
        TargetLog1pTransform
    }
}
impl StreamingTargetPreprocessor for TargetLog1pTransform {
    /// Returns `ln(1 + target)`. Nothing is learned from the value; inputs
    /// below `-1` yield NaN, as with `f64::ln_1p`.
    fn fit_transform(&mut self, target: f64) -> f64 {
        f64::ln_1p(target)
    }

    /// Returns `exp(transformed) - 1`, the inverse of `fit_transform`.
    fn inverse_transform(&self, transformed: f64) -> f64 {
        f64::exp_m1(transformed)
    }

    /// No-op: the transform holds no state.
    fn reset(&mut self) {}
}
/// Pairs a `TargetEncoder` with the feature vector that should accompany the
/// next target passed through the streaming-preprocessor interface.
#[derive(Clone, Debug)]
pub struct TargetEncoderPreprocessor {
    /// The wrapped encoder.
    inner: crate::preprocessing::TargetEncoder,
    /// Feature vector handed to the encoder together with each target.
    last_features: Vec<f64>,
}

impl TargetEncoderPreprocessor {
    /// Wraps `inner`, storing `initial_features` as the current feature vector.
    pub fn new(inner: crate::preprocessing::TargetEncoder, initial_features: Vec<f64>) -> Self {
        let last_features = initial_features;
        Self {
            inner,
            last_features,
        }
    }

    /// Shared access to the wrapped encoder.
    pub fn encoder(&self) -> &crate::preprocessing::TargetEncoder {
        let Self { inner, .. } = self;
        inner
    }

    /// Exclusive access to the wrapped encoder.
    pub fn encoder_mut(&mut self) -> &mut crate::preprocessing::TargetEncoder {
        let Self { inner, .. } = self;
        inner
    }

    /// Replaces the stored feature vector with `features`.
    pub fn set_features(&mut self, features: Vec<f64>) {
        self.last_features = features;
    }
}
impl StreamingTargetPreprocessor for TargetEncoderPreprocessor {
    /// Passes the stored feature vector and `target` to the encoder's
    /// `update`, then returns `target` unchanged.
    fn fit_transform(&mut self, target: f64) -> f64 {
        let features = &self.last_features;
        self.inner.update(features, target);
        target
    }

    /// Returns the encoder's `global_mean()`; the argument is ignored.
    fn inverse_transform(&self, _transformed: f64) -> f64 {
        let encoder = &self.inner;
        encoder.global_mean()
    }

    /// Resets the wrapped encoder. The stored feature vector is left as is.
    fn reset(&mut self) {
        let encoder = &mut self.inner;
        encoder.reset();
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance for comparisons of values of order one.
    const EPS: f64 = 1e-9;

    /// `inverse_transform` must undo `fit_transform` when both use the same
    /// (post-update) statistics.
    #[test]
    fn target_scaler_fit_transform_then_inverse_recovers_original() {
        let mut scaler = TargetScaler::new();
        for &y in &[1.0_f64, 2.0, 3.0, 4.0, 5.0] {
            scaler.fit_transform(y);
        }
        let test_targets = [0.0_f64, 6.0, 3.0, -1.0, 100.0];
        for &original in &test_targets {
            let transformed = scaler.fit_transform(original);
            let recovered = scaler.inverse_transform(transformed);
            assert!(
                (recovered - original).abs() < EPS,
                "target_scaler round-trip failed for {}: transformed={}, recovered={}",
                original,
                transformed,
                recovered
            );
        }
    }

    /// After `reset` the scaler reports the same mean and variance as a
    /// freshly constructed one.
    #[test]
    fn target_scaler_reset_clears_state() {
        let mut scaler = TargetScaler::new();
        for &y in &[10.0_f64, 20.0, 30.0] {
            scaler.fit_transform(y);
        }
        assert!(scaler.mean() != 0.0);
        scaler.reset();
        assert_eq!(scaler.mean(), 0.0);
        assert_eq!(scaler.variance(), scaler.variance_floor);
    }

    /// The very first observation must not divide by zero.
    #[test]
    fn target_scaler_cold_start_is_finite() {
        let mut scaler = TargetScaler::new();
        let t = scaler.fit_transform(42.0);
        assert!(t.is_finite(), "first fit_transform should be finite");
        let r = scaler.inverse_transform(t);
        assert!(
            (r - 42.0).abs() < EPS,
            "cold-start round-trip failed: {}",
            r
        );
    }

    /// The online statistics must agree with a two-pass reference.
    #[test]
    fn target_scaler_accumulates_statistics_online() {
        let mut scaler = TargetScaler::new();
        let n = 100_usize;
        let data: Vec<f64> = (0..n).map(|i| i as f64).collect();
        for &y in &data {
            scaler.fit_transform(y);
        }
        let ref_mean: f64 = data.iter().sum::<f64>() / n as f64;
        let ref_var: f64 =
            data.iter().map(|&y| (y - ref_mean).powi(2)).sum::<f64>() / (n - 1) as f64;
        assert!(
            (scaler.mean() - ref_mean).abs() < 1e-9,
            "Welford mean diverged from reference: {} vs {}",
            scaler.mean(),
            ref_mean
        );
        assert!(
            (scaler.variance() - ref_var).abs() < 1e-6,
            "Welford variance diverged from reference: {} vs {}",
            scaler.variance(),
            ref_var
        );
    }

    /// `exp_m1(ln_1p(y))` must recover `y` for non-negative inputs.
    #[test]
    fn target_log1p_transform_inverse_correct_for_positive() {
        let mut t = TargetLog1pTransform::new();
        let test_values = [0.0_f64, 0.5, 1.0, 10.0, 100.0, 1_000_000.0];
        for &original in &test_values {
            let transformed = t.fit_transform(original);
            let recovered = t.inverse_transform(transformed);
            // Scale the tolerance with the magnitude of the value: a rounding
            // error of one ulp in `ln_1p(1e6)` is amplified by `exp_m1` to
            // roughly 1e-9 in absolute terms, and the precision of both
            // functions is platform-dependent, so a fixed absolute bound of
            // `EPS` is too tight for large inputs.
            let tolerance = EPS * original.abs().max(1.0);
            assert!(
                (recovered - original).abs() < tolerance,
                "log1p round-trip failed for {}: transformed={}, recovered={}",
                original,
                transformed,
                recovered
            );
        }
    }

    /// Zero maps to zero in both directions.
    #[test]
    fn target_log1p_transform_boundary_zero() {
        let mut t = TargetLog1pTransform::new();
        let transformed = t.fit_transform(0.0);
        assert_eq!(transformed, 0.0, "log1p(0+1) = log(1) = 0");
        let recovered = t.inverse_transform(transformed);
        assert!(
            recovered.abs() < EPS,
            "expm1(0) should be 0, got {}",
            recovered
        );
    }

    /// Inputs below -1 are outside the domain of `ln(1 + y)`.
    #[test]
    fn target_log1p_transform_out_of_domain_produces_nan() {
        let mut t = TargetLog1pTransform::new();
        let result = t.fit_transform(-2.0);
        assert!(
            result.is_nan(),
            "out-of-domain value should produce NaN, got {}",
            result
        );
    }

    /// `reset` on the stateless transform changes nothing observable.
    #[test]
    fn target_log1p_transform_stateless_reset_no_op() {
        let mut t = TargetLog1pTransform::new();
        t.fit_transform(5.0);
        t.fit_transform(10.0);
        t.reset();
        let transformed = t.fit_transform(42.0);
        let recovered = t.inverse_transform(transformed);
        assert!(
            (recovered - 42.0).abs() < EPS,
            "after reset round-trip failed: {}",
            recovered
        );
    }

    /// Targets streamed under two different feature vectors end up in the
    /// encoder's global mean, which `inverse_transform` returns.
    #[test]
    fn target_encoder_handles_categorical_streaming() {
        use crate::preprocessing::TargetEncoder;
        let enc = TargetEncoder::new(vec![0]);
        let mut tp = TargetEncoderPreprocessor::new(enc, vec![1.0, 5.0]);
        for _ in 0..50 {
            tp.fit_transform(10.0);
        }
        tp.set_features(vec![2.0, 5.0]);
        for _ in 0..50 {
            tp.fit_transform(20.0);
        }
        let global_mean = tp.encoder().global_mean();
        assert!(
            (global_mean - 15.0).abs() < 0.5,
            "global mean expected ~15.0, got {}",
            global_mean
        );
        let approx = tp.inverse_transform(0.0);
        assert!(
            (approx - global_mean).abs() < EPS,
            "inverse_transform should return global_mean, got {}",
            approx
        );
    }

    /// `reset` on the wrapper must reset the wrapped encoder.
    #[test]
    fn target_encoder_preprocessor_reset_clears_encoder() {
        use crate::preprocessing::TargetEncoder;
        let enc = TargetEncoder::new(vec![0]);
        let mut tp = TargetEncoderPreprocessor::new(enc, vec![1.0]);
        for _ in 0..10 {
            tp.fit_transform(5.0);
        }
        assert!(tp.encoder().global_mean() != 0.0);
        tp.reset();
        assert_eq!(tp.encoder().global_mean(), 0.0);
    }
}