use crate::core::{IntervalType, PredictionResult, RegressionResult};
use crate::solvers::traits::{FittedRegressor, RegressionError, Regressor};
use faer::prelude::Solve;
use faer::{Col, Mat};
#[derive(Debug, Clone)]
pub struct RansacRegressor {
with_intercept: bool,
min_samples: Option<usize>,
residual_threshold: Option<f64>,
max_trials: usize,
stop_probability: f64,
stop_n_inliers: Option<usize>,
random_state: u64,
}
impl Default for RansacRegressor {
fn default() -> Self {
Self {
with_intercept: true,
min_samples: None,
residual_threshold: None,
max_trials: 100,
stop_probability: 0.99,
stop_n_inliers: None,
random_state: 0,
}
}
}
impl RansacRegressor {
pub fn builder() -> RansacRegressorBuilder {
RansacRegressorBuilder::default()
}
}
impl Regressor for RansacRegressor {
type Fitted = FittedRansac;
fn fit(&self, x: &Mat<f64>, y: &Col<f64>) -> Result<Self::Fitted, RegressionError> {
let n = x.nrows();
let p = x.ncols();
if n != y.nrows() {
return Err(RegressionError::DimensionMismatch {
x_rows: n,
y_len: y.nrows(),
});
}
let p_eff = if self.with_intercept { p + 1 } else { p };
let min_samples = self.min_samples.unwrap_or(p_eff);
if min_samples > n {
return Err(RegressionError::InsufficientObservations {
needed: min_samples,
got: n,
});
}
let residual_threshold = self.residual_threshold.unwrap_or_else(|| mad(y));
let mut rng = SplitMix64::new(self.random_state.wrapping_add(0x9E37_79B9_7F4A_7C15));
let mut best_inlier_count: usize = 0;
let mut best_mask: Vec<bool> = vec![false; n];
let mut effective_max_trials = self.max_trials;
let mut buf = vec![0usize; min_samples];
let mut trials_run = 0_usize;
while trials_run < effective_max_trials {
random_combination(n, min_samples, &mut buf, &mut rng);
let beta = match ols_on_subsample(x, y, &buf, self.with_intercept) {
Some(b) => b,
None => {
trials_run += 1;
continue;
}
};
let intercept = if self.with_intercept { beta[0] } else { 0.0 };
let coef_slice = if self.with_intercept {
&beta[1..]
} else {
&beta[..]
};
let mut mask = vec![false; n];
let mut inlier_count = 0_usize;
for i in 0..n {
let mut pred = intercept;
for j in 0..p {
pred += x[(i, j)] * coef_slice[j];
}
let resid = (y[i] - pred).abs();
if resid < residual_threshold {
mask[i] = true;
inlier_count += 1;
}
}
if inlier_count > best_inlier_count {
best_inlier_count = inlier_count;
best_mask = mask;
if best_inlier_count >= min_samples {
let w = best_inlier_count as f64 / n as f64;
let denom = (1.0 - w.powi(min_samples as i32)).ln();
if denom < 0.0 {
let num = (1.0 - self.stop_probability).ln();
let need = (num / denom).ceil() as usize;
effective_max_trials = effective_max_trials.min(need.max(1));
}
}
if let Some(threshold) = self.stop_n_inliers {
if best_inlier_count >= threshold {
break;
}
}
}
trials_run += 1;
}
if best_inlier_count < min_samples {
return Err(RegressionError::ConvergenceFailed {
iterations: self.max_trials,
});
}
let inlier_idx: Vec<usize> = (0..n).filter(|&i| best_mask[i]).collect();
let final_beta = ols_normal_eq(x, y, &inlier_idx, self.with_intercept)
.ok_or(RegressionError::SingularMatrix)?;
let intercept = if self.with_intercept {
Some(final_beta[0])
} else {
None
};
let coef_slice = if self.with_intercept {
&final_beta[1..]
} else {
&final_beta[..]
};
let coefficients = Col::from_fn(p, |i| coef_slice[i]);
let fitted_values = Col::from_fn(n, |i| {
let mut v = intercept.unwrap_or(0.0);
for j in 0..p {
v += x[(i, j)] * coef_slice[j];
}
v
});
let residuals = Col::from_fn(n, |i| y[i] - fitted_values[i]);
let n_params = p + if self.with_intercept { 1 } else { 0 };
let mut result = RegressionResult::empty(p, n);
result.coefficients = coefficients;
result.intercept = intercept;
result.residuals = residuals;
result.fitted_values = fitted_values;
result.rank = n_params;
result.n_parameters = n_params;
result.n_observations = n;
result.r_squared = compute_r_squared(y, &result.residuals);
Ok(FittedRansac {
result,
inlier_mask: best_mask,
n_trials: trials_run,
residual_threshold,
})
}
}
fn compute_r_squared(y: &Col<f64>, residuals: &Col<f64>) -> f64 {
let n = y.nrows();
let y_mean: f64 = (0..n).map(|i| y[i]).sum::<f64>() / n as f64;
let tss: f64 = (0..n).map(|i| (y[i] - y_mean).powi(2)).sum();
let rss: f64 = (0..n).map(|i| residuals[i].powi(2)).sum();
if tss == 0.0 {
if rss == 0.0 {
1.0
} else {
0.0
}
} else {
1.0 - rss / tss
}
}
fn mad(y: &Col<f64>) -> f64 {
let mut v: Vec<f64> = (0..y.nrows()).map(|i| y[i]).collect();
let med = median_in_place(&mut v);
for vi in v.iter_mut() {
*vi = (*vi - med).abs();
}
median_in_place(&mut v)
}
fn median_in_place(v: &mut [f64]) -> f64 {
v.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let n = v.len();
#[allow(clippy::manual_is_multiple_of)]
if n % 2 == 0 {
0.5 * (v[n / 2 - 1] + v[n / 2])
} else {
v[n / 2]
}
}
fn ols_on_subsample(
x: &Mat<f64>,
y: &Col<f64>,
idx: &[usize],
with_intercept: bool,
) -> Option<Vec<f64>> {
let p = x.ncols();
let p_eff = if with_intercept { p + 1 } else { p };
let k = idx.len();
let xs = Mat::from_fn(k, p_eff, |i, j| {
if with_intercept {
if j == 0 {
1.0
} else {
x[(idx[i], j - 1)]
}
} else {
x[(idx[i], j)]
}
});
let ys = Col::from_fn(k, |i| y[idx[i]]);
if k == p_eff {
let lu = xs.partial_piv_lu();
let beta = lu.solve(&ys);
let mut out = Vec::with_capacity(p_eff);
for i in 0..p_eff {
if !beta[i].is_finite() {
return None;
}
out.push(beta[i]);
}
Some(out)
} else {
ols_normal_eq_from_design(&xs, &ys)
}
}
fn ols_normal_eq(
x: &Mat<f64>,
y: &Col<f64>,
idx: &[usize],
with_intercept: bool,
) -> Option<Vec<f64>> {
let p = x.ncols();
let p_eff = if with_intercept { p + 1 } else { p };
let k = idx.len();
let xs = Mat::from_fn(k, p_eff, |i, j| {
if with_intercept {
if j == 0 {
1.0
} else {
x[(idx[i], j - 1)]
}
} else {
x[(idx[i], j)]
}
});
let ys = Col::from_fn(k, |i| y[idx[i]]);
ols_normal_eq_from_design(&xs, &ys)
}
fn ols_normal_eq_from_design(xs: &Mat<f64>, ys: &Col<f64>) -> Option<Vec<f64>> {
let xtx = xs.transpose() * xs;
let xty = xs.transpose() * ys;
let lu = xtx.partial_piv_lu();
let beta = lu.solve(&xty);
let p_eff = xs.ncols();
let mut out = Vec::with_capacity(p_eff);
for i in 0..p_eff {
if !beta[i].is_finite() {
return None;
}
out.push(beta[i]);
}
Some(out)
}
fn random_combination(n: usize, k: usize, out: &mut [usize], rng: &mut SplitMix64) {
debug_assert_eq!(out.len(), k);
let mut chosen: Vec<usize> = Vec::with_capacity(k);
for j in (n - k)..n {
let t = rng.gen_range_usize(0, j + 1);
if chosen.contains(&t) {
chosen.push(j);
} else {
chosen.push(t);
}
}
chosen.sort_unstable();
out.copy_from_slice(&chosen);
}
#[derive(Debug, Clone)]
struct SplitMix64 {
state: u64,
}
impl SplitMix64 {
fn new(seed: u64) -> Self {
Self { state: seed }
}
fn next_u64(&mut self) -> u64 {
self.state = self.state.wrapping_add(0x9E37_79B9_7F4A_7C15);
let mut z = self.state;
z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
z ^ (z >> 31)
}
fn gen_range_usize(&mut self, lo: usize, hi: usize) -> usize {
let span = (hi - lo) as u64;
if span == 0 {
return lo;
}
let bound = u64::MAX - (u64::MAX % span);
loop {
let r = self.next_u64();
if r < bound {
return lo + (r % span) as usize;
}
}
}
}
#[derive(Debug, Clone)]
pub struct FittedRansac {
result: RegressionResult,
inlier_mask: Vec<bool>,
n_trials: usize,
residual_threshold: f64,
}
impl FittedRansac {
pub fn inlier_mask(&self) -> &[bool] {
&self.inlier_mask
}
pub fn n_inliers(&self) -> usize {
self.inlier_mask.iter().filter(|&&b| b).count()
}
pub fn n_trials(&self) -> usize {
self.n_trials
}
pub fn residual_threshold(&self) -> f64 {
self.residual_threshold
}
}
impl FittedRegressor for FittedRansac {
fn predict(&self, x: &Mat<f64>) -> Col<f64> {
let n = x.nrows();
let p = x.ncols();
let intercept = self.result.intercept.unwrap_or(0.0);
Col::from_fn(n, |i| {
let mut v = intercept;
for j in 0..p {
v += x[(i, j)] * self.result.coefficients[j];
}
v
})
}
fn result(&self) -> &RegressionResult {
&self.result
}
fn predict_with_interval(
&self,
x: &Mat<f64>,
_interval: Option<IntervalType>,
_level: f64,
) -> PredictionResult {
PredictionResult::point_only(self.predict(x))
}
}
#[derive(Debug, Clone, Default)]
pub struct RansacRegressorBuilder {
inner: RansacRegressor,
}
impl RansacRegressorBuilder {
pub fn with_intercept(mut self, include: bool) -> Self {
self.inner.with_intercept = include;
self
}
pub fn min_samples(mut self, value: usize) -> Self {
self.inner.min_samples = Some(value);
self
}
pub fn residual_threshold(mut self, value: f64) -> Self {
self.inner.residual_threshold = Some(value);
self
}
pub fn max_trials(mut self, value: usize) -> Self {
self.inner.max_trials = value;
self
}
pub fn stop_probability(mut self, value: f64) -> Self {
self.inner.stop_probability = value;
self
}
pub fn stop_n_inliers(mut self, value: usize) -> Self {
self.inner.stop_n_inliers = Some(value);
self
}
pub fn random_state(mut self, seed: u64) -> Self {
self.inner.random_state = seed;
self
}
pub fn build(self) -> RansacRegressor {
self.inner
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn ransac_ignores_outliers() {
let n_in = 50;
let n_out = 20;
let n = n_in + n_out;
let mut xs: Vec<f64> = (0..n_in).map(|i| i as f64 * 0.2).collect();
let mut ys: Vec<f64> = xs.iter().map(|xi| 1.0 + 2.0 * xi).collect();
for i in 0..n_out {
xs.push(i as f64 * 0.1);
ys.push(50.0 + i as f64);
}
let x = Mat::from_fn(n, 1, |i, _| xs[i]);
let y = Col::from_fn(n, |i| ys[i]);
let fitted = RansacRegressor::builder()
.residual_threshold(0.5)
.max_trials(200)
.random_state(42)
.build()
.fit(&x, &y)
.expect("RANSAC fit failed");
let intercept = fitted.result().intercept.unwrap();
let slope = fitted.result().coefficients[0];
assert!(
(intercept - 1.0).abs() < 1e-6,
"intercept = {} (expected 1.0)",
intercept
);
assert!(
(slope - 2.0).abs() < 1e-6,
"slope = {} (expected 2.0)",
slope
);
for i in n_in..n {
assert!(!fitted.inlier_mask()[i], "obs {} should be an outlier", i);
}
}
}