use crate::error::{StatsError, StatsResult};
use crate::regression::utils::*;
use crate::regression::{linregress, RegressionResults};
use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
use scirs2_core::numeric::Float;
use scirs2_linalg::{inv, lstsq};
/// Result of a Theil-Sen (median-of-pairwise-slopes) regression fit.
pub struct TheilSlopesResult<F>
where
    F: Float + std::fmt::Debug + std::fmt::Display + 'static,
{
    /// Robust slope estimate (median of pairwise slopes).
    pub slope: F,
    /// Intercept, computed as median(y) - slope * median(x).
    pub intercept: F,
    /// Lower bound of the slope confidence interval at the requested level.
    pub slope_low: F,
    /// Upper bound of the slope confidence interval at the requested level.
    pub slope_high: F,
    /// Approximate standard error of the slope.
    pub slope_stderr: F,
}
impl<F> TheilSlopesResult<F>
where
F: Float + std::fmt::Debug + std::fmt::Display + 'static,
{
pub fn summary(&self) -> String {
let mut summary = String::new();
summary.push_str("=== Theil-Sen Regression Results ===\n\n");
summary.push_str(&format!(
"Formula: y = {:.6} + {:.6}*x\n\n",
self.intercept, self.slope
));
summary.push_str("Parameters:\n");
summary.push_str(&format!(" Slope: {:.6}\n", self.slope));
summary.push_str(&format!(" Intercept: {:.6}\n\n", self.intercept));
summary.push_str("Slope Statistics:\n");
summary.push_str(&format!(" Std. Error: {:.6}\n", self.slope_stderr));
summary.push_str(&format!(
" 95% CI: [{:.6}, {:.6}]\n",
self.slope_low, self.slope_high
));
summary.push_str("\nNote: Theil-Sen is a robust non-parametric estimator\n");
summary.push_str("that is resistant to outliers in the data.\n");
summary
}
}
/// Huber's T weighting scheme used by the robust (M-estimator) regression.
pub struct HuberT<F>
where
    F: Float + 'static + std::fmt::Display,
{
    /// Tuning constant: residuals with |r| <= t receive full weight.
    pub t: F,
    /// Weight function mapping (scaled residual, t) to a weight in (0, 1].
    pub weight_func: fn(F, F) -> F,
}
impl<F> HuberT<F>
where
    F: Float + 'static + std::fmt::Display,
{
    /// Construct with the conventional default tuning constant t = 1.345.
    pub fn new() -> Self {
        Self::with_t(F::from(1.345).expect("Failed to convert constant to float"))
    }

    /// Construct with a caller-supplied tuning constant.
    pub fn with_t(t: F) -> Self {
        HuberT {
            t,
            weight_func: huber_weight,
        }
    }

    /// Huber loss: quadratic for |r| <= t, linear (with matched value) beyond.
    pub fn loss(&self, r: F) -> F {
        let half = F::from(0.5).expect("Failed to convert constant to float");
        let magnitude = crate::regression::utils::float_abs(r);
        if magnitude <= self.t {
            half * crate::regression::utils::float_powi(r, 2)
        } else {
            self.t * magnitude - half * crate::regression::utils::float_powi(self.t, 2)
        }
    }
}
impl<F> Default for HuberT<F>
where
F: Float + 'static + std::fmt::Display,
{
fn default() -> Self {
Self::new()
}
}
/// Huber weight function: full weight inside the threshold, t/|r| outside.
#[allow(dead_code)]
fn huber_weight<F>(r: F, t: F) -> F
where
    F: Float + 'static + std::fmt::Display,
{
    let magnitude = crate::regression::utils::float_abs(r);
    // Same condition as the loss: downweight only once |r| exceeds t.
    match magnitude <= t {
        true => F::one(),
        false => t / magnitude,
    }
}
/// Theil-Sen estimator: robust slope/intercept from the median of pairwise slopes.
///
/// * `alpha`  - confidence level for the slope interval (default 0.95).
/// * `method` - "exact" selects the alternate stderr formula; anything else
///   (default "approximate") uses the approximate one.
///
/// Groups of identical x values are collapsed by averaging their y values,
/// then the function recurses on the de-duplicated data (terminates because
/// the collapsed data has unique x values).
///
/// # Errors
/// Length mismatch between `x` and `y`, or fewer than 2 data points.
#[allow(dead_code)]
pub fn theilslopes<F>(
    x: &ArrayView1<F>,
    y: &ArrayView1<F>,
    alpha: Option<F>,
    method: Option<&str>,
) -> StatsResult<TheilSlopesResult<F>>
where
    F: Float
        + std::iter::Sum<F>
        + std::ops::Div<Output = F>
        + std::fmt::Debug
        + std::fmt::Display
        + 'static
        + Send
        + Sync,
{
    if x.len() != y.len() {
        return Err(StatsError::DimensionMismatch(format!(
            "Input x has length {} but y has length {}",
            x.len(),
            y.len()
        )));
    }
    let n = x.len();
    if n < 2 {
        return Err(StatsError::InvalidArgument(
            "At least 2 data points are required for Theil-Sen regression".to_string(),
        ));
    }
    let alpha =
        alpha.unwrap_or_else(|| F::from(0.95).expect("Failed to convert constant to float"));
    let method = method.unwrap_or("approximate");
    // Collapse tied x values before estimating; `find_repeats` returns groups
    // of indices that share the same x value.
    let repeated_x = find_repeats(x);
    if !repeated_x.is_empty() {
        let mut x_vals = Vec::new();
        let mut y_vals = Vec::new();
        // Keep every point that is not part of any repeat group as-is.
        for i in 0..n {
            let mut is_repeated = false;
            for repeats in &repeated_x {
                if repeats.contains(&i) {
                    is_repeated = true;
                    break;
                }
            }
            if !is_repeated {
                x_vals.push(x[i]);
                y_vals.push(y[i]);
            }
        }
        // Replace each repeat group with a single point at (x, mean(y)).
        for repeats in &repeated_x {
            let x_val = x[repeats[0]];
            let mut y_sum = F::zero();
            for &idx in repeats {
                y_sum = y_sum + y[idx];
            }
            let y_avg = y_sum / F::from(repeats.len()).expect("Operation failed");
            x_vals.push(x_val);
            y_vals.push(y_avg);
        }
        let x_arr = Array1::from(x_vals);
        let y_arr = Array1::from(y_vals);
        // Recurse on the de-duplicated data with the resolved options.
        return theilslopes(&x_arr.view(), &y_arr.view(), Some(alpha), Some(method));
    }
    // Robust point estimates: median of pairwise slopes, medians for intercept.
    let slope = compute_median_slope(x, y);
    let x_median = compute_median(x);
    let y_median = compute_median(y);
    let intercept = y_median - slope * x_median;
    // Two-sided normal quantile at level alpha for the confidence interval.
    let z =
        norm_ppf(F::from(0.5).expect("Failed to convert constant to float") * (F::one() + alpha));
    let n_f = F::from(n).expect("Failed to convert to float");
    let slope_stderr = match method {
        // "exact": 1 / (6 * sqrt(n)).
        "exact" => {
            F::from(1.0).expect("Failed to convert constant to float")
                / (F::from(6.0).expect("Failed to convert constant to float")
                    * scirs2_core::numeric::Float::sqrt(n_f))
        }
        // Approximate: sqrt(3n(n+1) / (0.5 n (n-1))) / (sqrt(n) * sqrt(n-1)).
        _ => {
            let factor = scirs2_core::numeric::Float::sqrt(
                F::from(3.0).expect("Failed to convert constant to float") * n_f * (n_f + F::one())
                    / (F::from(0.5).expect("Failed to convert constant to float")
                        * n_f
                        * (n_f - F::one())),
            );
            factor
                / (scirs2_core::numeric::Float::sqrt(n_f)
                    * scirs2_core::numeric::Float::sqrt(n_f - F::one()))
        }
    };
    // Symmetric normal-approximation interval around the slope.
    let quant = z * slope_stderr;
    let slope_low = slope - quant;
    let slope_high = slope + quant;
    Ok(TheilSlopesResult {
        slope,
        intercept,
        slope_low,
        slope_high,
        slope_stderr,
    })
}
/// Median of a 1-D view; returns 0 for an empty input.
/// NaN values compare as equal to everything, so their placement is unspecified.
#[allow(dead_code)]
fn compute_median<F>(x: &ArrayView1<F>) -> F
where
    F: Float + 'static + std::fmt::Display,
{
    let len = x.len();
    if len == 0 {
        return F::zero();
    }
    let mut values = x.to_owned();
    // `to_owned` from a view is contiguous, so the slice is always available.
    let buffer = values.as_slice_mut().expect("Operation failed");
    buffer.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let mid = len / 2;
    if len % 2 == 0 {
        // Even length: average the two central elements.
        let two = F::from(2.0).expect("Failed to convert constant to float");
        (values[mid - 1] + values[mid]) / two
    } else {
        values[mid]
    }
}
/// RANSAC (RANdom SAmple Consensus) robust simple linear regression.
///
/// Repeatedly fits a line to `min_samples_` randomly chosen points, counts
/// inliers whose absolute residual is below `residual_threshold`, and finally
/// refits on the best inlier set. Only the first feature column of `x` is
/// used (this is a simple y = a + b*x regression).
///
/// * `residual_threshold` defaults to 2.5 * MAD of the full-data OLS residuals.
/// * `stop_probability` drives the adaptive early-stopping rule on the number
///   of trials.
/// * `random_seed` makes the sampling reproducible.
///
/// # Errors
/// Dimension mismatch, too few observations, or no valid model found.
#[allow(dead_code)]
pub fn ransac<F>(
    x: &ArrayView2<F>,
    y: &ArrayView1<F>,
    min_samples_: Option<usize>,
    residual_threshold: Option<F>,
    max_trials: Option<usize>,
    stop_probability: Option<F>,
    random_seed: Option<u64>,
) -> StatsResult<RegressionResults<F>>
where
    F: Float
        + std::iter::Sum<F>
        + std::ops::Div<Output = F>
        + std::fmt::Debug
        + std::fmt::Display
        + 'static
        + scirs2_core::numeric::NumAssign
        + scirs2_core::numeric::One
        + scirs2_core::ndarray::ScalarOperand
        + Send
        + Sync,
{
    use scirs2_core::random::SliceRandom;
    // BUG FIX: `x.len()` on a 2-D array is the total element count; compare
    // the number of rows (observations) against `y`.
    if x.nrows() != y.len() {
        return Err(StatsError::DimensionMismatch(format!(
            "Input x has {} rows but y has length {}",
            x.nrows(),
            y.len()
        )));
    }
    let n = x.nrows();
    let min_samples_ = min_samples_.unwrap_or(2);
    let max_trials = max_trials.unwrap_or(100);
    let stop_probability = stop_probability
        .unwrap_or_else(|| F::from(0.99).expect("Failed to convert constant to float"));
    if n < min_samples_ {
        return Err(StatsError::InvalidArgument(format!(
            "Number of data points ({}) must be at least min_samples_ ({})",
            n, min_samples_
        )));
    }
    // Default threshold: 2.5 * MAD of the residuals of an ordinary fit on the
    // full data (robust to the very outliers RANSAC is meant to reject).
    let residual_threshold = if let Some(rt) = residual_threshold {
        rt
    } else {
        let x_vec = Array1::from_shape_fn(n, |i| x[[i, 0]]);
        let (slope, intercept___, _, _, _) = linregress(&x_vec.view(), y)?;
        let residuals = y
            .iter()
            .enumerate()
            .map(|(i, &yi)| yi - (intercept___ + slope * x[[i, 0]]))
            .collect::<Vec<F>>();
        let residuals_array = Array1::from(residuals);
        let mad = crate::regression::utils::median_abs_deviation_from_zero(&residuals_array.view());
        mad * F::from(2.5).expect("Failed to convert constant to float")
    };
    use scirs2_core::random::Random;
    let mut rng = if let Some(_seed) = random_seed {
        Random::seed(_seed)
    } else {
        use scirs2_core::random::{Rng, RngExt};
        let mut temp_rng = scirs2_core::random::thread_rng();
        Random::seed(temp_rng.random())
    };
    let mut best_model = None;
    let mut best_inlier_count = 0;
    let mut best_inlier_mask = vec![false; n];
    // Adaptive trial budget; may shrink as good consensus sets are found.
    let mut n_trials = max_trials;
    for trial in 0..max_trials {
        // Draw a random minimal sample of `min_samples_` distinct points.
        let mut indices: Vec<usize> = (0..n).collect();
        indices.shuffle(&mut rng);
        let sample_indices = indices[0..min_samples_].to_vec();
        let sample_x = Array2::from_shape_fn((min_samples_, 2), |(i, j)| {
            if j == 0 {
                F::one()
            } else {
                x[[sample_indices[i], 0]]
            }
        });
        let sample_y = Array1::from_shape_fn(min_samples_, |i| y[sample_indices[i]]);
        // Degenerate-sample guard: skip samples whose x values have (near-)zero
        // spread. NOTE(review): the spread is measured by absolute magnitude,
        // not by signed min/max — confirm this matches the intended check.
        let mut max_val = F::zero();
        let mut min_val = F::infinity();
        for &val in sample_x.column(1).iter() {
            if crate::regression::utils::float_abs(val)
                > crate::regression::utils::float_abs(max_val)
            {
                max_val = val;
            }
            if crate::regression::utils::float_abs(val)
                < crate::regression::utils::float_abs(min_val)
            {
                min_val = val;
            }
        }
        let sample_x_range = max_val - min_val;
        if sample_x_range < F::epsilon() {
            continue;
        }
        let model = match fit_linear_model(&sample_x.view(), &sample_y.view()) {
            Ok(m) => m,
            Err(_) => continue,
        };
        let intercept = model[0];
        let slope = model[1];
        // BUG FIX: score residuals against the feature column only; iterating
        // `x.iter()` would walk every element of a multi-column matrix.
        let residuals = y
            .iter()
            .zip(x.column(0).iter())
            .map(|(&yi, &xi)| crate::regression::utils::float_abs(yi - (intercept + slope * xi)))
            .collect::<Vec<F>>();
        let mut inlier_mask = vec![false; n];
        let mut inlier_count = 0;
        for i in 0..n {
            if crate::regression::utils::float_abs(residuals[i]) < residual_threshold {
                inlier_mask[i] = true;
                inlier_count += 1;
            }
        }
        if inlier_count > best_inlier_count {
            best_model = Some(model);
            best_inlier_count = inlier_count;
            best_inlier_mask = inlier_mask;
            // Adaptive stopping: estimate how many trials are needed to hit an
            // all-inlier sample with probability `stop_probability`.
            if inlier_count > min_samples_ {
                let inlier_ratio = F::from(inlier_count).expect("Failed to convert to float")
                    / F::from(n).expect("Failed to convert to float");
                let power_term =
                    crate::regression::utils::float_powi(inlier_ratio, min_samples_ as i32);
                let denom = F::one() - power_term;
                if denom > F::epsilon() {
                    let numerator = crate::regression::utils::float_ln(F::one() - stop_probability);
                    let denominator = crate::regression::utils::float_ln(F::one() - power_term);
                    let new_n_trials = numerator / denominator;
                    n_trials = new_n_trials
                        .to_usize()
                        .unwrap_or(max_trials)
                        .min(max_trials);
                }
            }
        }
        // BUG FIX: `trial >= n_trials - 1` underflows usize when the adaptive
        // update drives n_trials to 0; the addition form is safe.
        if trial + 1 >= n_trials {
            break;
        }
    }
    if best_model.is_none() {
        return Err(StatsError::ComputationError(
            "RANSAC could not find a valid model".to_string(),
        ));
    }
    let inlier_indices: Vec<usize> = best_inlier_mask
        .iter()
        .enumerate()
        .filter_map(|(i, &is_inlier)| if is_inlier { Some(i) } else { None })
        .collect();
    let inlier_count = inlier_indices.len();
    if inlier_count < 2 {
        return Err(StatsError::ComputationError(
            "RANSAC found too few inliers to fit a model".to_string(),
        ));
    }
    // Final refit on the best consensus set, with full regression statistics.
    let inlier_x = Array2::from_shape_fn((inlier_count, 2), |(i, j)| {
        if j == 0 {
            F::one()
        } else {
            x[[inlier_indices[i], 0]]
        }
    });
    let inlier_y = Array1::from_shape_fn(inlier_count, |i| y[inlier_indices[i]]);
    let mut regression_result = simple_linear_regression(&inlier_x.view(), &inlier_y.view())?;
    regression_result.inlier_mask = best_inlier_mask;
    Ok(regression_result)
}
/// Solve `x * beta = y` by least squares, wrapping solver errors as StatsError.
#[allow(dead_code)]
fn fit_linear_model<F>(x: &ArrayView2<F>, y: &ArrayView1<F>) -> StatsResult<Array1<F>>
where
    F: Float
        + std::iter::Sum<F>
        + std::ops::Div<Output = F>
        + 'static
        + scirs2_core::numeric::NumAssign
        + scirs2_core::numeric::One
        + scirs2_core::ndarray::ScalarOperand
        + std::fmt::Display
        + Send
        + Sync,
{
    lstsq(x, y, None).map(|result| result.x).map_err(|e| {
        StatsError::ComputationError(format!("Least squares computation failed: {:?}", e))
    })
}
/// Ordinary least-squares fit on a pre-built design matrix, with summary
/// statistics, packaged as a `RegressionResults`.
///
/// `x` is assumed to already contain the intercept column if one is wanted;
/// `df_model = p - 1` reflects that assumption.
///
/// Notes (review):
/// * p-values use a closed-form approximation of the two-sided t-test, not an
///   exact t-distribution CDF.
/// * Confidence intervals use a hard-coded 1.96 (i.e. ~95% normal quantile).
/// * `f_p_value` is a placeholder (always 0).
#[allow(dead_code)]
fn simple_linear_regression<F>(
    x: &ArrayView2<F>,
    y: &ArrayView1<F>,
) -> StatsResult<RegressionResults<F>>
where
    F: Float
        + std::iter::Sum<F>
        + std::ops::Div<Output = F>
        + std::fmt::Debug
        + std::fmt::Display
        + 'static
        + scirs2_core::numeric::NumAssign
        + scirs2_core::numeric::One
        + scirs2_core::ndarray::ScalarOperand
        + Send
        + Sync,
{
    let n = x.nrows();
    let p = x.ncols();
    let coefficients = match lstsq(x, y, None) {
        Ok(result) => result.x,
        Err(e) => {
            return Err(StatsError::ComputationError(format!(
                "Least squares computation failed: {:?}",
                e
            )))
        }
    };
    let fitted_values = x.dot(&coefficients);
    let residuals = y.to_owned() - &fitted_values;
    // Degrees of freedom; df_residuals == 0 when n == p (no guard here —
    // downstream divisions then produce non-finite values, not panics).
    let df_model = p - 1;
    let df_residuals = n - p;
    let (_y_mean, ss_total, ss_residual, ss_explained) =
        calculate_sum_of_squares(y, &residuals.view());
    let r_squared = ss_explained / ss_total;
    let adj_r_squared = F::one()
        - (F::one() - r_squared) * F::from(n - 1).expect("Failed to convert to float")
            / F::from(df_residuals).expect("Failed to convert to float");
    let mse = ss_residual / F::from(df_residuals).expect("Failed to convert to float");
    let residual_std_error = scirs2_core::numeric::Float::sqrt(mse);
    // Standard errors fall back to zeros if the normal matrix is singular.
    let std_errors = match calculate_std_errors(x, &residuals.view(), df_residuals) {
        Ok(se) => se,
        Err(_) => Array1::<F>::zeros(p),
    };
    let t_values = calculate_t_values(&coefficients, &std_errors);
    // Approximate two-sided p-value: 2 * (1 - |t| / sqrt(df + t^2)).
    let p_values = t_values.mapv(|t| {
        let t_abs = scirs2_core::numeric::Float::abs(t);
        let df_f = F::from(df_residuals).expect("Failed to convert to float");
        F::from(2.0).expect("Failed to convert constant to float")
            * (F::one() - t_abs / scirs2_core::numeric::Float::sqrt(df_f + t_abs * t_abs))
    });
    // ~95% normal-approximation confidence intervals per coefficient.
    let mut conf_intervals = Array2::<F>::zeros((p, 2));
    for i in 0..p {
        let margin = std_errors[i] * F::from(1.96).expect("Failed to convert constant to float");
        conf_intervals[[i, 0]] = coefficients[i] - margin;
        conf_intervals[[i, 1]] = coefficients[i] + margin;
    }
    let f_statistic = if df_model > 0 && df_residuals > 0 {
        (ss_explained / F::from(df_model).expect("Failed to convert to float"))
            / (ss_residual / F::from(df_residuals).expect("Failed to convert to float"))
    } else {
        F::infinity()
    };
    // Placeholder: the F-test p-value is not computed.
    let f_p_value = F::zero();
    Ok(RegressionResults {
        coefficients,
        std_errors,
        t_values,
        p_values,
        conf_intervals,
        r_squared,
        adj_r_squared,
        f_statistic,
        f_p_value,
        residual_std_error,
        df_residuals,
        residuals,
        fitted_values,
        // All points are "inliers" for a plain OLS fit; callers may overwrite.
        inlier_mask: vec![true; n],
    })
}
/// Huber robust regression via iteratively reweighted least squares (IRLS).
///
/// * `epsilon`       - Huber tuning constant (default 1.345).
/// * `fit_intercept` - prepend an intercept column (default true).
/// * `scale`         - fixed residual scale; if `None`, estimated from the
///   MAD of the initial OLS residuals (divided by 0.6745, the usual
///   normal-consistency constant).
/// * `max_iter`/`tol` - IRLS iteration controls (defaults 50 / 1e-5).
/// * `conf_level`    - confidence level for coefficient intervals (default 0.95).
///
/// # Errors
/// Dimension mismatch, or `n <= p` after adding the intercept.
#[allow(clippy::too_many_arguments)]
#[allow(dead_code)]
pub fn huber_regression<F>(
    x: &ArrayView2<F>,
    y: &ArrayView1<F>,
    epsilon: Option<F>,
    fit_intercept: Option<bool>,
    scale: Option<F>,
    max_iter: Option<usize>,
    tol: Option<F>,
    conf_level: Option<F>,
) -> StatsResult<RegressionResults<F>>
where
    F: Float
        + std::iter::Sum<F>
        + std::ops::Div<Output = F>
        + std::fmt::Debug
        + std::fmt::Display
        + 'static
        + scirs2_core::numeric::NumAssign
        + scirs2_core::numeric::One
        + Send
        + Sync
        + scirs2_core::ndarray::ScalarOperand,
{
    if x.nrows() != y.len() {
        return Err(StatsError::DimensionMismatch(format!(
            "Input x has {} rows but y has length {}",
            x.nrows(),
            y.len()
        )));
    }
    let n = x.nrows();
    let _p_features = x.ncols();
    let epsilon =
        epsilon.unwrap_or_else(|| F::from(1.345).expect("Failed to convert constant to float"));
    let fit_intercept = fit_intercept.unwrap_or(true);
    let max_iter = max_iter.unwrap_or(50);
    let tol = tol.unwrap_or_else(|| F::from(1e-5).expect("Failed to convert constant to float"));
    let conf_level =
        conf_level.unwrap_or_else(|| F::from(0.95).expect("Failed to convert constant to float"));
    let x_design = if fit_intercept {
        add_intercept(x)
    } else {
        x.to_owned()
    };
    let p = x_design.ncols();
    if n <= p {
        return Err(StatsError::InvalidArgument(format!(
            "Number of observations ({}) must be greater than number of parameters ({})",
            n, p
        )));
    }
    // Start IRLS from the plain OLS solution.
    let initial_coeffs = fit_ols(&x_design.view(), y)?;
    let initial_residuals = y.to_owned() - &x_design.dot(&initial_coeffs);
    // Robust scale: fixed by the caller, or MAD / 0.6745 of the OLS residuals.
    let mut sigma = if let Some(s) = scale {
        s
    } else {
        let mad = median_abs_deviation_from_zero(&initial_residuals.view());
        mad / F::from(0.6745).expect("Failed to convert constant to float")
    };
    let huber = HuberT::with_t(epsilon);
    let mut coefficients = initial_coeffs;
    let mut residuals = initial_residuals;
    let mut weights = Array1::<F>::ones(n);
    for _ in 0..max_iter {
        // Recompute Huber weights from the scaled residuals.
        let mut weight_sum = F::zero();
        for i in 0..n {
            let weight = (huber.weight_func)(residuals[i] / sigma, huber.t);
            weights[i] = weight;
            weight_sum += weight;
        }
        // Early exit: average weight ~1 means essentially no downweighting,
        // so the current (OLS-like) fit is already the answer.
        if weight_sum / F::from(n).expect("Failed to convert to float") > F::one() - tol {
            break;
        }
        // Weighted least squares via sqrt-weight row scaling of X and y.
        let sqrt_weights = weights.mapv(|w| scirs2_core::numeric::Float::sqrt(w));
        let sqrt_weights_col = sqrt_weights
            .clone()
            .insert_axis(scirs2_core::ndarray::Axis(1));
        let x_weighted = &x_design * &sqrt_weights_col;
        let y_weighted = &y.to_owned() * &sqrt_weights;
        let new_coeffs = fit_ols(&x_weighted.view(), &y_weighted.view())?;
        let new_residuals = y - &x_design.dot(&new_coeffs);
        // NOTE(review): sigma is updated with the plain RMS of residuals here,
        // not a robust scale — confirm this is the intended behavior.
        sigma = crate::regression::utils::float_sqrt(
            new_residuals
                .iter()
                .map(|&r| crate::regression::utils::float_powi(r, 2))
                .sum::<F>()
                / F::from(n).expect("Failed to convert to float"),
        );
        // Relative L1 change in coefficients as the convergence criterion.
        let coef_change = (&new_coeffs - &coefficients)
            .mapv(|x| crate::regression::utils::float_abs(x))
            .sum()
            / coefficients
                .mapv(|x| crate::regression::utils::float_abs(x))
                .sum();
        coefficients = new_coeffs;
        residuals = new_residuals;
        if coef_change < tol {
            break;
        }
    }
    // Summary statistics on the final (unweighted) residuals.
    let fitted_values = x_design.dot(&coefficients);
    let df_model = p - if fit_intercept { 1 } else { 0 };
    let df_residuals = n - p;
    let y_mean = y.iter().cloned().sum::<F>() / F::from(n).expect("Failed to convert to float");
    let ss_total = y
        .iter()
        .map(|&yi| scirs2_core::numeric::Float::powi(yi - y_mean, 2))
        .sum::<F>();
    let ss_residual = residuals
        .iter()
        .map(|&ri| scirs2_core::numeric::Float::powi(ri, 2))
        .sum::<F>();
    let ss_explained = ss_total - ss_residual;
    let r_squared = ss_explained / ss_total;
    let adj_r_squared = F::one()
        - (F::one() - r_squared) * F::from(n - 1).expect("Failed to convert to float")
            / F::from(df_residuals).expect("Failed to convert to float");
    let mse = ss_residual / F::from(df_residuals).expect("Failed to convert to float");
    let residual_std_error = scirs2_core::numeric::Float::sqrt(mse);
    // Robust standard errors from the weighted normal matrix; zeros on failure.
    let std_errors = match calculate_huber_std_errors(
        &x_design.view(),
        &residuals.view(),
        &weights.view(),
        sigma,
        df_residuals,
    ) {
        Ok(se) => se,
        Err(_) => Array1::<F>::zeros(p),
    };
    let t_values = calculate_t_values(&coefficients, &std_errors);
    // Approximate two-sided p-value: 2 * (1 - |t| / sqrt(df + t^2)).
    let p_values = t_values.mapv(|t| {
        let t_abs = scirs2_core::numeric::Float::abs(t);
        let df_f = F::from(df_residuals).expect("Failed to convert to float");
        F::from(2.0).expect("Failed to convert constant to float")
            * (F::one() - t_abs / scirs2_core::numeric::Float::sqrt(df_f + t_abs * t_abs))
    });
    // Normal-quantile confidence intervals at `conf_level`.
    let mut conf_intervals = Array2::<F>::zeros((p, 2));
    let z = norm_ppf(
        F::from(0.5).expect("Failed to convert constant to float") * (F::one() + conf_level),
    );
    for i in 0..p {
        let margin = std_errors[i] * z;
        conf_intervals[[i, 0]] = coefficients[i] - margin;
        conf_intervals[[i, 1]] = coefficients[i] + margin;
    }
    let f_statistic = if df_model > 0 && df_residuals > 0 {
        (ss_explained / F::from(df_model).expect("Failed to convert to float"))
            / (ss_residual / F::from(df_residuals).expect("Failed to convert to float"))
    } else {
        F::infinity()
    };
    // Placeholder: the F-test p-value is not computed.
    let f_p_value = F::zero();
    Ok(RegressionResults {
        coefficients,
        std_errors,
        t_values,
        p_values,
        conf_intervals,
        r_squared,
        adj_r_squared,
        f_statistic,
        f_p_value,
        residual_std_error,
        df_residuals,
        residuals,
        fitted_values,
        inlier_mask: vec![true; n],
    })
}
/// Ordinary least squares via `lstsq`, mapping solver failures to StatsError.
/// (Kept separate from `fit_linear_model`, which has the same contract.)
#[allow(dead_code)]
fn fit_ols<F>(x: &ArrayView2<F>, y: &ArrayView1<F>) -> StatsResult<Array1<F>>
where
    F: Float
        + std::iter::Sum<F>
        + std::ops::Div<Output = F>
        + 'static
        + scirs2_core::numeric::NumAssign
        + scirs2_core::numeric::One
        + scirs2_core::ndarray::ScalarOperand
        + std::fmt::Display
        + Send
        + Sync,
{
    let solution = lstsq(x, y, None).map_err(|e| {
        StatsError::ComputationError(format!("Least squares computation failed: {:?}", e))
    })?;
    Ok(solution.x)
}
/// Standard errors for the Huber fit from the weighted normal matrix.
///
/// Accumulates X' W X, inverts it, and takes sqrt(diag * sigma^2).
/// Returns a zero vector when the matrix is singular.
#[allow(dead_code)]
fn calculate_huber_std_errors<F>(
    x: &ArrayView2<F>,
    _residuals: &ArrayView1<F>,
    weights: &ArrayView1<F>,
    sigma: F,
    _df: usize,
) -> StatsResult<Array1<F>>
where
    F: Float
        + std::iter::Sum<F>
        + std::ops::Div<Output = F>
        + 'static
        + scirs2_core::numeric::NumAssign
        + scirs2_core::numeric::One
        + scirs2_core::ndarray::ScalarOperand
        + std::fmt::Display
        + Send
        + Sync,
{
    let p = x.ncols();
    // Accumulate the weighted Gram matrix X' W X one observation at a time.
    let mut weighted_gram = Array2::<F>::zeros((p, p));
    for i in 0..x.nrows() {
        let w = weights[i];
        let row = x.row(i);
        for j in 0..p {
            for k in 0..p {
                weighted_gram[[j, k]] += w * row[j] * row[k];
            }
        }
    }
    // Singular matrix: report zero standard errors rather than failing.
    let gram_inv = match inv(&weighted_gram.view(), None) {
        Ok(m) => m,
        Err(_) => return Ok(Array1::<F>::zeros(p)),
    };
    let sigma_sq = scirs2_core::numeric::Float::powi(sigma, 2);
    Ok(gram_inv
        .diag()
        .mapv(|v| scirs2_core::numeric::Float::sqrt(v * sigma_sq)))
}
/// Result of a Least Trimmed Squares (LTS) regression fit.
pub struct LtsResult<F>
where
    F: Float + std::fmt::Debug + std::fmt::Display + 'static,
{
    /// Fitted coefficients (intercept first; design matrix adds an intercept).
    pub coefficients: Array1<F>,
    /// Per-observation flag: true for the h points kept by the best fit.
    pub inlier_mask: Vec<bool>,
    /// Number of observations (h) the trimmed objective is computed over.
    pub n_support: usize,
    /// Sum of the h smallest squared residuals for the best fit.
    pub trimmed_sse: F,
    /// R-squared computed over the supporting (inlier) observations only.
    pub r_squared: F,
    /// Residuals for ALL observations under the best coefficients.
    pub residuals: Array1<F>,
}
/// Least Trimmed Squares regression: minimize the sum of the h smallest
/// squared residuals, making the fit resistant to up to `trim_fraction`
/// outliers.
///
/// Strategy: draw random elemental subsets of size p (exact fit), refine each
/// with C-steps (`lts_c_steps`), and keep the candidate with the smallest
/// trimmed SSE.
///
/// * `trim_fraction` in [0, 0.5); default 0.25.
/// * `n_trials` default: min(500, 10n). `max_c_steps` default: 10.
/// * `random_seed` makes the subset sampling reproducible.
///
/// # Errors
/// Dimension mismatch, invalid trim fraction, too few observations, or no
/// candidate subset yielding a solvable fit.
#[allow(clippy::too_many_arguments)]
pub fn lts_regression<F>(
    x: &ArrayView2<F>,
    y: &ArrayView1<F>,
    trim_fraction: Option<f64>,
    n_trials: Option<usize>,
    max_c_steps: Option<usize>,
    random_seed: Option<u64>,
) -> StatsResult<LtsResult<F>>
where
    F: Float
        + std::iter::Sum<F>
        + std::ops::Div<Output = F>
        + std::fmt::Debug
        + std::fmt::Display
        + 'static
        + scirs2_core::numeric::NumAssign
        + scirs2_core::numeric::One
        + scirs2_core::ndarray::ScalarOperand
        + Send
        + Sync,
{
    use scirs2_core::random::SliceRandom;
    let n = x.nrows();
    let p_feat = x.ncols();
    if n != y.len() {
        return Err(StatsError::DimensionMismatch(format!(
            "x has {} rows but y has length {}",
            n,
            y.len()
        )));
    }
    let trim_frac = trim_fraction.unwrap_or(0.25);
    if trim_frac < 0.0 || trim_frac >= 0.5 {
        return Err(StatsError::InvalidArgument(
            "trim_fraction must be in [0, 0.5)".to_string(),
        ));
    }
    // Coverage h: number of observations kept, capped at n.
    let h = ((n as f64) * (1.0 - trim_frac)).floor() as usize + 1;
    let h = h.min(n);
    // p counts the intercept added below.
    let p = p_feat + 1;
    if h < p {
        return Err(StatsError::InvalidArgument(format!(
            "Not enough observations to keep (h={}) for {} parameters",
            h, p
        )));
    }
    if n < p + 1 {
        return Err(StatsError::InvalidArgument(
            "Need at least p+2 observations for LTS".to_string(),
        ));
    }
    let n_trials = n_trials.unwrap_or(500.min(n * 10));
    let max_c_steps = max_c_steps.unwrap_or(10);
    let x_design = add_intercept(x);
    use scirs2_core::random::Random;
    let mut rng = if let Some(seed) = random_seed {
        Random::seed(seed)
    } else {
        use scirs2_core::random::{Rng, RngExt};
        let mut temp = scirs2_core::random::thread_rng();
        Random::seed(temp.random())
    };
    let mut best_sse = F::infinity();
    let mut best_inlier_mask = vec![false; n];
    let mut best_coeffs = Array1::<F>::zeros(p);
    let mut indices: Vec<usize> = (0..n).collect();
    for _trial in 0..n_trials {
        // Random elemental subset of exactly p points (square system).
        indices.shuffle(&mut rng);
        let subset = &indices[..p];
        let sub_x = Array2::from_shape_fn((p, p), |(i, j)| x_design[[subset[i], j]]);
        let sub_y = Array1::from_shape_fn(p, |i| y[subset[i]]);
        let coeffs = match lstsq(&sub_x.view(), &sub_y.view(), None) {
            Ok(res) => res.x,
            Err(_) => continue,
        };
        // Refine with concentration steps and score by trimmed SSE.
        let (final_coeffs, final_mask, final_sse) =
            lts_c_steps(&x_design, y, &coeffs, h, max_c_steps)?;
        if final_sse < best_sse {
            best_sse = final_sse;
            best_inlier_mask = final_mask;
            best_coeffs = final_coeffs;
        }
    }
    // All trials failed to produce a solvable subset.
    if best_sse.is_infinite() {
        return Err(StatsError::ComputationError(
            "LTS could not find a valid fit".to_string(),
        ));
    }
    let fitted = x_design.dot(&best_coeffs);
    let residuals = y.to_owned() - &fitted;
    // R-squared over the supporting (inlier) observations only.
    let inlier_y_sum = best_inlier_mask
        .iter()
        .enumerate()
        .filter(|(_, &m)| m)
        .map(|(i, _)| y[i])
        .sum::<F>();
    let h_f = F::from(h).unwrap_or_else(|| F::one());
    let inlier_y_mean = inlier_y_sum / h_f;
    let ss_tot: F = best_inlier_mask
        .iter()
        .enumerate()
        .filter(|(_, &m)| m)
        .map(|(i, _)| float_powi(y[i] - inlier_y_mean, 2))
        .sum();
    let ss_res: F = best_inlier_mask
        .iter()
        .enumerate()
        .filter(|(_, &m)| m)
        .map(|(i, _)| float_powi(residuals[i], 2))
        .sum();
    // Guard against a constant-y inlier set (ss_tot ~ 0).
    let r_squared = if ss_tot > F::epsilon() {
        F::one() - ss_res / ss_tot
    } else {
        F::one()
    };
    Ok(LtsResult {
        coefficients: best_coeffs,
        inlier_mask: best_inlier_mask,
        n_support: h,
        trimmed_sse: best_sse,
        r_squared,
        residuals,
    })
}
/// Concentration steps (C-steps) for LTS refinement.
///
/// Starting from `initial_coeffs`, repeatedly: compute squared residuals for
/// all points, keep the h smallest, and refit on those h points. Each step
/// cannot increase the trimmed SSE; iteration stops when the trimmed SSE
/// stops changing, when the refit fails, or after `max_steps` iterations.
///
/// Returns (coefficients, inlier mask of size n with exactly h true entries,
/// trimmed SSE).
fn lts_c_steps<F>(
    x_design: &Array2<F>,
    y: &ArrayView1<F>,
    initial_coeffs: &Array1<F>,
    h: usize,
    max_steps: usize,
) -> StatsResult<(Array1<F>, Vec<bool>, F)>
where
    F: Float
        + std::iter::Sum<F>
        + std::ops::Div<Output = F>
        + std::fmt::Debug
        + std::fmt::Display
        + 'static
        + scirs2_core::numeric::NumAssign
        + scirs2_core::numeric::One
        + scirs2_core::ndarray::ScalarOperand
        + Send
        + Sync,
{
    let n = x_design.nrows();
    let mut coeffs = initial_coeffs.clone();
    let mut prev_sse = F::infinity();
    for _step in 0..max_steps {
        // Squared residual for every observation, tagged with its index.
        let fitted = x_design.dot(&coeffs);
        let residuals: Vec<(F, usize)> = (0..n)
            .map(|i| (float_powi(y[i] - fitted[i], 2), i))
            .collect();
        let mut sorted_res = residuals;
        sorted_res.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));
        // Keep the h best-fitting observations.
        let inlier_indices: Vec<usize> = sorted_res[..h].iter().map(|&(_, idx)| idx).collect();
        let trimmed_sse: F = sorted_res[..h].iter().map(|&(sq, _)| sq).sum();
        // Converged: the trimmed SSE no longer changes.
        let diff = float_abs(prev_sse - trimmed_sse);
        if diff < F::epsilon() {
            let mut mask = vec![false; n];
            for &idx in &inlier_indices {
                mask[idx] = true;
            }
            return Ok((coeffs, mask, trimmed_sse));
        }
        prev_sse = trimmed_sse;
        // Refit on the current h inliers; on solver failure, return the
        // current state instead of propagating an error.
        let sub_x = Array2::from_shape_fn((h, x_design.ncols()), |(i, j)| {
            x_design[[inlier_indices[i], j]]
        });
        let sub_y = Array1::from_shape_fn(h, |i| y[inlier_indices[i]]);
        coeffs = match lstsq(&sub_x.view(), &sub_y.view(), None) {
            Ok(res) => res.x,
            Err(_) => {
                let mut mask = vec![false; n];
                for &idx in &inlier_indices {
                    mask[idx] = true;
                }
                return Ok((coeffs, mask, trimmed_sse));
            }
        };
    }
    // Step budget exhausted: recompute the mask and SSE for the final coeffs.
    let fitted = x_design.dot(&coeffs);
    let mut residuals: Vec<(F, usize)> = (0..n)
        .map(|i| (float_powi(y[i] - fitted[i], 2), i))
        .collect();
    residuals.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));
    let trimmed_sse: F = residuals[..h].iter().map(|&(sq, _)| sq).sum();
    let mut mask = vec![false; n];
    for &(_, idx) in residuals[..h].iter() {
        mask[idx] = true;
    }
    Ok((coeffs, mask, trimmed_sse))
}
/// Tukey bisquare (biweight) robust regression via IRLS.
///
/// Observations with |residual / sigma| >= c receive zero weight (hard
/// rejection), others get weight (1 - (u/c)^2)^2; sigma is re-estimated each
/// pass as MAD / 0.6745.
///
/// * `c`             - tuning constant (default 4.685).
/// * `fit_intercept` - prepend an intercept column (default true).
/// * `max_iter`/`tol` - iteration controls (defaults 50 / 1e-6).
///
/// # Errors
/// Dimension mismatch, or `n <= p` after adding the intercept.
#[allow(clippy::too_many_arguments)]
pub fn bisquare_regression<F>(
    x: &ArrayView2<F>,
    y: &ArrayView1<F>,
    c: Option<F>,
    fit_intercept: Option<bool>,
    max_iter: Option<usize>,
    tol: Option<F>,
) -> StatsResult<RegressionResults<F>>
where
    F: Float
        + std::iter::Sum<F>
        + std::ops::Div<Output = F>
        + std::fmt::Debug
        + std::fmt::Display
        + 'static
        + scirs2_core::numeric::NumAssign
        + scirs2_core::numeric::One
        + scirs2_core::ndarray::ScalarOperand
        + Send
        + Sync,
{
    if x.nrows() != y.len() {
        return Err(StatsError::DimensionMismatch(format!(
            "x has {} rows but y has length {}",
            x.nrows(),
            y.len()
        )));
    }
    let n = x.nrows();
    let fit_intercept = fit_intercept.unwrap_or(true);
    let c = c.unwrap_or_else(|| F::from(4.685).expect("const"));
    let max_iter = max_iter.unwrap_or(50);
    let tol = tol.unwrap_or_else(|| F::from(1e-6).expect("const"));
    let x_design = if fit_intercept {
        add_intercept(x)
    } else {
        x.to_owned()
    };
    let p = x_design.ncols();
    if n <= p {
        return Err(StatsError::InvalidArgument(format!(
            "Need more observations ({}) than parameters ({})",
            n, p
        )));
    }
    // Start IRLS from the plain OLS solution.
    let mut coefficients = fit_ols(&x_design.view(), y)?;
    let mut residuals = y.to_owned() - &x_design.dot(&coefficients);
    for _ in 0..max_iter {
        // Robust scale from the current residuals (MAD / normality constant).
        let sigma = {
            let mad = median_abs_deviation_from_zero(&residuals.view());
            mad / F::from(0.6745).expect("const")
        };
        // Degenerate scale (e.g. exact fit): nothing left to reweight.
        if sigma <= F::epsilon() {
            break;
        }
        // Bisquare weights; points with |u| >= c stay at weight 0.
        let mut weights = Array1::<F>::zeros(n);
        for i in 0..n {
            let u = residuals[i] / sigma;
            let u_over_c = u / c;
            let u2 = u_over_c * u_over_c;
            if u2 < F::one() {
                let one_minus_u2 = F::one() - u2;
                weights[i] = one_minus_u2 * one_minus_u2;
            }
        }
        // Weighted least squares via sqrt-weight row scaling.
        let sqrt_w = weights.mapv(|w| scirs2_core::numeric::Float::sqrt(w));
        let sqrt_w_col = sqrt_w.clone().insert_axis(scirs2_core::ndarray::Axis(1));
        let xw = &x_design * &sqrt_w_col;
        let yw = &y.to_owned() * &sqrt_w;
        // Solver failure: keep the last good coefficients.
        let new_coeffs = match lstsq(&xw.view(), &yw.view(), None) {
            Ok(res) => res.x,
            Err(_) => break,
        };
        let new_residuals = y - &x_design.dot(&new_coeffs);
        // Relative L1 change in coefficients as the convergence criterion.
        let coef_change = (&new_coeffs - &coefficients).mapv(float_abs).sum();
        let coef_norm = coefficients.mapv(float_abs).sum();
        let rel_change = if coef_norm > F::epsilon() {
            coef_change / coef_norm
        } else {
            coef_change
        };
        coefficients = new_coeffs;
        residuals = new_residuals;
        if rel_change < tol {
            break;
        }
    }
    // Summary statistics on the final (unweighted) residuals.
    let fitted_values = x_design.dot(&coefficients);
    let df_residuals = n - p;
    let y_mean = y.iter().cloned().sum::<F>() / F::from(n).expect("const");
    let ss_total = y.iter().map(|&yi| float_powi(yi - y_mean, 2)).sum::<F>();
    let ss_res = residuals.iter().map(|&ri| float_powi(ri, 2)).sum::<F>();
    let ss_exp = ss_total - ss_res;
    let r_squared = if ss_total > F::epsilon() {
        ss_exp / ss_total
    } else {
        F::one()
    };
    let adj_r_squared = F::one()
        - (F::one() - r_squared) * F::from(n - 1).expect("const")
            / F::from(df_residuals).expect("const");
    let mse = ss_res / F::from(df_residuals).expect("const");
    let residual_std_error = scirs2_core::numeric::Float::sqrt(mse);
    // Standard errors fall back to zeros if the computation fails.
    let std_errors = match calculate_std_errors(&x_design.view(), &residuals.view(), df_residuals) {
        Ok(se) => se,
        Err(_) => Array1::<F>::zeros(p),
    };
    let t_values = calculate_t_values(&coefficients, &std_errors);
    // Approximate two-sided p-value: 2 * (1 - |t| / sqrt(df + t^2)).
    let p_values = t_values.mapv(|t| {
        let t_abs = scirs2_core::numeric::Float::abs(t);
        let df_f = F::from(df_residuals).expect("const");
        F::from(2.0).expect("const")
            * (F::one() - t_abs / scirs2_core::numeric::Float::sqrt(df_f + t_abs * t_abs))
    });
    let df_model = p - if fit_intercept { 1 } else { 0 };
    let f_statistic = if df_model > 0 && df_residuals > 0 {
        (ss_exp / F::from(df_model).expect("const"))
            / (ss_res / F::from(df_residuals).expect("const"))
    } else {
        F::infinity()
    };
    // ~95% normal-approximation confidence intervals (hard-coded 1.96).
    let mut conf_intervals = Array2::<F>::zeros((p, 2));
    for i in 0..p {
        let margin = std_errors[i] * F::from(1.96).expect("const");
        conf_intervals[[i, 0]] = coefficients[i] - margin;
        conf_intervals[[i, 1]] = coefficients[i] + margin;
    }
    Ok(RegressionResults {
        coefficients,
        std_errors,
        t_values,
        p_values,
        conf_intervals,
        r_squared,
        adj_r_squared,
        f_statistic,
        // Placeholder: the F-test p-value is not computed.
        f_p_value: F::zero(),
        residual_std_error,
        df_residuals,
        residuals,
        fitted_values,
        inlier_mask: vec![true; n],
    })
}
// Unit tests for the LTS and bisquare robust estimators.
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::{array, Array2};

    // Clean linear data: LTS should recover a slope near 2.
    #[test]
    fn test_lts_basic() {
        let x = Array2::from_shape_vec(
            (10, 1),
            vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
        )
        .expect("shape");
        let y = array![2.1, 4.0, 5.9, 8.1, 10.0, 12.0, 14.0, 16.1, 18.0, 20.1];
        let result = lts_regression(&x.view(), &y.view(), None, None, None, Some(42))
            .expect("LTS should succeed");
        assert!((result.coefficients[1] - 2.0).abs() < 0.3);
    }

    // One gross outlier (last y): slope still ~2 and the outlier is trimmed.
    #[test]
    fn test_lts_with_outlier() {
        let x = Array2::from_shape_vec(
            (10, 1),
            vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
        )
        .expect("shape");
        let y = array![2.1, 4.0, 5.9, 8.1, 10.0, 12.0, 14.0, 16.1, 18.0, 50.0];
        let result = lts_regression(&x.view(), &y.view(), None, None, None, Some(42))
            .expect("LTS should succeed");
        assert!((result.coefficients[1] - 2.0).abs() < 0.5);
        assert!(!result.inlier_mask[9]);
    }

    // Two outliers with a tighter trim fraction and more trials.
    #[test]
    fn test_lts_multiple_outliers() {
        let x = Array2::from_shape_vec(
            (12, 1),
            vec![
                1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0,
            ],
        )
        .expect("shape");
        let y = array![2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 50.0, 60.0, 22.0, 24.0];
        let result = lts_regression(&x.view(), &y.view(), Some(0.2), Some(200), None, Some(42))
            .expect("LTS should succeed");
        assert!((result.coefficients[1] - 2.0).abs() < 0.5);
    }

    // Perfectly linear data: inlier R-squared should be near 1.
    #[test]
    fn test_lts_r_squared() {
        let x = Array2::from_shape_vec(
            (10, 1),
            vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
        )
        .expect("shape");
        let y = array![2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0];
        let result = lts_regression(&x.view(), &y.view(), None, None, None, Some(42))
            .expect("LTS should succeed");
        let r2: f64 = scirs2_core::numeric::NumCast::from(result.r_squared).expect("cast");
        assert!(r2 > 0.95);
    }

    // Mismatched x rows vs y length must be rejected.
    #[test]
    fn test_lts_dimension_mismatch() {
        let x = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0]).expect("shape");
        let y = array![1.0, 2.0, 3.0];
        assert!(lts_regression(&x.view(), &y.view(), None, None, None, None).is_err());
    }

    // n < p + 2 must be rejected.
    #[test]
    fn test_lts_too_few_observations() {
        let x = Array2::from_shape_vec((2, 1), vec![1.0, 2.0]).expect("shape");
        let y = array![1.0, 2.0];
        assert!(lts_regression(&x.view(), &y.view(), None, None, None, None).is_err());
    }

    // Clean linear data: bisquare should recover a slope near 2.
    #[test]
    fn test_bisquare_basic() {
        let x = Array2::from_shape_vec(
            (10, 1),
            vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
        )
        .expect("shape");
        let y = array![2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0];
        let result = bisquare_regression(&x.view(), &y.view(), None, None, None, None)
            .expect("bisquare should succeed");
        assert!((result.coefficients[1] - 2.0).abs() < 0.3);
    }

    // One gross outlier: the bisquare weights should suppress its influence.
    #[test]
    fn test_bisquare_with_outlier() {
        let x = Array2::from_shape_vec(
            (10, 1),
            vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
        )
        .expect("shape");
        let y = array![2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 50.0];
        let result = bisquare_regression(&x.view(), &y.view(), None, None, None, None)
            .expect("bisquare should succeed");
        assert!((result.coefficients[1] - 2.0).abs() < 1.0);
    }

    // Nearly linear data: R-squared should be near 1.
    #[test]
    fn test_bisquare_r_squared() {
        let x = Array2::from_shape_vec(
            (10, 1),
            vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
        )
        .expect("shape");
        let y = array![2.1, 4.0, 5.9, 8.1, 10.0, 12.0, 14.1, 16.0, 18.0, 20.1];
        let result = bisquare_regression(&x.view(), &y.view(), None, None, None, None)
            .expect("bisquare should succeed");
        let r2: f64 = scirs2_core::numeric::NumCast::from(result.r_squared).expect("cast");
        assert!(r2 > 0.95);
    }

    // A smaller tuning constant c rejects the outlier more aggressively.
    #[test]
    fn test_bisquare_custom_c() {
        let x = Array2::from_shape_vec(
            (10, 1),
            vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
        )
        .expect("shape");
        let y = array![2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 50.0];
        let result = bisquare_regression(&x.view(), &y.view(), Some(2.0), None, None, None)
            .expect("bisquare should succeed");
        assert!((result.coefficients[1] - 2.0).abs() < 1.0);
    }

    // Mismatched x rows vs y length must be rejected.
    #[test]
    fn test_bisquare_dimension_mismatch() {
        let x = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0]).expect("shape");
        let y = array![1.0, 2.0, 3.0];
        assert!(bisquare_regression(&x.view(), &y.view(), None, None, None, None).is_err());
    }

    // n <= p must be rejected.
    #[test]
    fn test_bisquare_too_few() {
        let x = Array2::from_shape_vec((1, 1), vec![1.0]).expect("shape");
        let y = array![1.0];
        assert!(bisquare_regression(&x.view(), &y.view(), None, None, None, None).is_err());
    }
}