scirs2-optimize 0.4.4

//! Robust least squares methods
//!
//! This module provides M-estimators that are less sensitive to outliers than standard least squares.
//! The key idea is to use a different loss function that reduces the influence of large residuals.
//!
//! # Example
//!
//! ```
//! use scirs2_core::ndarray::{array, Array1, Array2};
//! use scirs2_optimize::least_squares::robust::{robust_least_squares, HuberLoss, RobustOptions};
//!
//! // Define a function that returns the residuals
//! fn residual(x: &[f64], data: &[f64]) -> Array1<f64> {
//!     let n = data.len() / 2;
//!     let t_values = &data[0..n];
//!     let y_values = &data[n..];
//!     
//!     let mut res = Array1::zeros(n);
//!     for i in 0..n {
//!         // Model: y = x[0] + x[1] * t
//!         res[i] = y_values[i] - (x[0] + x[1] * t_values[i]);
//!     }
//!     res
//! }
//!
//! // Define the Jacobian
//! fn jacobian(x: &[f64], data: &[f64]) -> Array2<f64> {
//!     let n = data.len() / 2;
//!     let t_values = &data[0..n];
//!     
//!     let mut jac = Array2::zeros((n, 2));
//!     for i in 0..n {
//!         jac[[i, 0]] = -1.0;
//!         jac[[i, 1]] = -t_values[i];
//!     }
//!     jac
//! }
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! // Create data with outliers (concatenated x and y values)
//! let data = array![0.0, 1.0, 2.0, 3.0, 4.0, 0.1, 0.9, 2.1, 2.9, 10.0];
//!
//! // Initial guess
//! let x0 = array![0.0, 0.0];
//!
//! // Solve using Huber loss for robustness
//! let loss = HuberLoss::new(1.0);
//! let result = robust_least_squares(
//!     residual,
//!     &x0,
//!     loss,
//!     Some(jacobian),
//!     &data,
//!     None
//! )?;
//!
//! assert!(result.success);
//! # Ok(())
//! # }
//! ```

use crate::error::OptimizeResult;
use crate::result::OptimizeResults;
use scirs2_core::ndarray::{Array1, Array2, ArrayBase, Data, Ix1};

/// Trait for robust loss functions
///
/// An implementor describes a scalar loss `rho(r)` that is applied to every
/// residual, together with the derived quantities the solvers in this module
/// ask for (`weight` is what the IRLS and gradient paths call).
pub trait RobustLoss: Clone {
    /// Compute the loss value `rho(r)` for a single residual `r`
    fn loss(&self, r: f64) -> f64;

    /// Compute the reweighting factor for a residual
    ///
    /// Weight = psi(r) / r, where psi is the derivative of the loss with
    /// respect to the residual.
    fn weight(&self, r: f64) -> f64;

    /// Compute the derivative of the weight function (for Hessian computation)
    fn weight_derivative(&self, r: f64) -> f64;
}

/// Standard least squares loss (for comparison)
///
/// Applies no down-weighting at all: every residual gets weight `1.0`, so a
/// solver driven by this loss behaves like ordinary least squares.
#[derive(Debug, Clone)]
pub struct SquaredLoss;

impl RobustLoss for SquaredLoss {
    /// Quadratic loss `0.5 * r^2`.
    fn loss(&self, r: f64) -> f64 {
        0.5 * r * r
    }

    /// Constant unit weight; the residual value is irrelevant, hence `_r`.
    fn weight(&self, _r: f64) -> f64 {
        1.0
    }

    /// The weight is constant, so its derivative is zero everywhere.
    fn weight_derivative(&self, _r: f64) -> f64 {
        0.0
    }
}

/// Huber loss function
///
/// The Huber loss is quadratic for small residuals and linear for large residuals,
/// providing a balance between efficiency and robustness.
#[derive(Debug, Clone)]
pub struct HuberLoss {
    /// Threshold on `|r|` at which the loss switches from quadratic to linear.
    delta: f64,
}

impl HuberLoss {
    /// Create a new Huber loss with the specified delta parameter
    ///
    /// The delta parameter determines the transition from quadratic to linear behavior.
    /// Smaller delta provides more robustness but less efficiency.
    ///
    /// # Panics
    ///
    /// Panics if `delta` is not strictly positive (this also rejects NaN).
    pub fn new(delta: f64) -> Self {
        assert!(delta > 0.0, "Delta must be positive");
        HuberLoss { delta }
    }
}

impl RobustLoss for HuberLoss {
    /// `0.5 * r^2` for `|r| <= delta`, otherwise `delta * (|r| - 0.5 * delta)`.
    fn loss(&self, r: f64) -> f64 {
        let abs_r = r.abs();
        if abs_r <= self.delta {
            0.5 * r * r
        } else {
            self.delta * (abs_r - 0.5 * self.delta)
        }
    }

    /// `1` in the quadratic region, `delta / |r|` in the linear region.
    ///
    /// Residuals smaller than `1e-10` in magnitude always get weight `1` so the
    /// division below can never blow up, even for a tiny `delta`.
    fn weight(&self, r: f64) -> f64 {
        let abs_r = r.abs();
        if abs_r < 1e-10 || abs_r <= self.delta {
            1.0
        } else {
            self.delta / abs_r
        }
    }

    /// Derivative of [`RobustLoss::weight`] with respect to `r`.
    ///
    /// In the linear region the weight is `delta / |r|`, whose derivative is
    /// `-delta * sign(r) / r^2`: negative for positive residuals and positive
    /// for negative ones. Elsewhere the weight is constant and the derivative
    /// is zero.
    fn weight_derivative(&self, r: f64) -> f64 {
        let abs_r = r.abs();
        if abs_r <= self.delta || abs_r < 1e-10 {
            0.0
        } else {
            // r * |r| == sign(r) * r^2, which carries the sign of the residual.
            -self.delta / (r * abs_r)
        }
    }
}

/// Bisquare (Tukey) loss function
///
/// The bisquare loss function provides strong protection against outliers by
/// completely rejecting residuals beyond a certain threshold.
#[derive(Debug, Clone)]
pub struct BisquareLoss {
    /// Rejection threshold: residuals with `|r| > c` get zero weight.
    c: f64,
}

impl BisquareLoss {
    /// Create a new Bisquare loss with the specified tuning constant
    ///
    /// The c parameter determines the rejection threshold.
    /// Typically set to 4.685 for 95% asymptotic efficiency.
    ///
    /// # Panics
    ///
    /// Panics if `c` is not strictly positive (this also rejects NaN).
    pub fn new(c: f64) -> Self {
        assert!(c > 0.0, "Tuning constant must be positive");
        BisquareLoss { c }
    }
}

impl RobustLoss for BisquareLoss {
    /// `(c^2 / 6) * (1 - (1 - (r/c)^2)^3)` inside the threshold, and the
    /// constant `c^2 / 6` beyond it.
    fn loss(&self, r: f64) -> f64 {
        let abs_r = r.abs();
        if abs_r <= self.c {
            let u = r / self.c;
            (self.c * self.c / 6.0) * (1.0 - (1.0 - u * u).powi(3))
        } else {
            self.c * self.c / 6.0
        }
    }

    /// `(1 - (r/c)^2)^2` inside the threshold and `0` beyond it.
    ///
    /// Residuals smaller than `1e-10` in magnitude are given weight `1`.
    fn weight(&self, r: f64) -> f64 {
        let abs_r = r.abs();
        if abs_r < 1e-10 {
            1.0
        } else if abs_r <= self.c {
            let u = r / self.c;
            (1.0 - u * u).powi(2)
        } else {
            0.0
        }
    }

    /// Derivative of [`RobustLoss::weight`] with respect to `r`.
    ///
    /// With `u = r / c` the weight is `(1 - u^2)^2`, so by the chain rule
    /// `dw/dr = 2 (1 - u^2) * (-2u) * (1/c) = -4 u (1 - u^2) / c`.
    /// Outside the threshold (and for `|r| < 1e-10`) the derivative is zero.
    fn weight_derivative(&self, r: f64) -> f64 {
        let abs_r = r.abs();
        if abs_r <= self.c && abs_r >= 1e-10 {
            let u = r / self.c;
            // Only one factor of `c` remains because `u` already contains 1/c.
            -4.0 * u * (1.0 - u * u) / self.c
        } else {
            0.0
        }
    }
}

/// Cauchy loss function
///
/// The Cauchy loss provides very strong protection against outliers
/// with a slowly decreasing influence function.
#[derive(Debug, Clone)]
pub struct CauchyLoss {
    /// Scale parameter dividing the residual before it enters the loss.
    c: f64,
}

impl CauchyLoss {
    /// Create a new Cauchy loss with the specified scale parameter
    ///
    /// # Panics
    ///
    /// Panics if `c` is not strictly positive (this also rejects NaN).
    pub fn new(c: f64) -> Self {
        assert!(c > 0.0, "Scale parameter must be positive");
        CauchyLoss { c }
    }
}

impl RobustLoss for CauchyLoss {
    /// `(c^2 / 2) * ln(1 + (r/c)^2)`.
    fn loss(&self, r: f64) -> f64 {
        let u = r / self.c;
        (self.c * self.c / 2.0) * (1.0 + u * u).ln()
    }

    /// `1 / (1 + (r/c)^2)`; residuals below `1e-10` in magnitude get weight `1`.
    fn weight(&self, r: f64) -> f64 {
        if r.abs() < 1e-10 {
            1.0
        } else {
            let u = r / self.c;
            1.0 / (1.0 + u * u)
        }
    }

    /// Derivative of [`RobustLoss::weight`] with respect to `r`.
    ///
    /// With `u = r / c` the weight is `1 / (1 + u^2)`, so by the chain rule
    /// `dw/dr = -2u / (1 + u^2)^2 * (1/c)`. Returns zero for `|r| < 1e-10`.
    fn weight_derivative(&self, r: f64) -> f64 {
        if r.abs() < 1e-10 {
            0.0
        } else {
            let u = r / self.c;
            let denom = 1.0 + u * u;
            // Only one factor of `c` remains because `u` already contains 1/c.
            -2.0 * u / (self.c * denom * denom)
        }
    }
}

/// Tuning knobs for [`robust_least_squares`]
///
/// Use `RobustOptions::default()` and override individual fields with struct
/// update syntax. Which fields matter depends on `use_irls`, because the two
/// solvers in this module consult different subsets of them.
#[derive(Debug, Clone)]
pub struct RobustOptions {
    /// Iteration cap of the gradient-based solver.
    ///
    /// Both solvers also use it to derive a default evaluation budget when
    /// `max_nfev` is `None`.
    pub max_iter: usize,

    /// Upper bound on residual-function evaluations; `None` lets each solver
    /// derive a budget from `max_iter` and the number of parameters.
    pub max_nfev: Option<usize>,

    /// Step-size tolerance: a solver stops once the step norm falls below
    /// `xtol * (1 + ||x||)`.
    pub xtol: f64,

    /// Tolerance on the change of the cost function.
    ///
    /// NOTE(review): neither solver in this module currently reads this field.
    pub ftol: f64,

    /// Gradient-norm tolerance (consulted by the gradient-based solver only).
    pub gtol: f64,

    /// `true` selects IRLS (Iteratively Reweighted Least Squares); `false`
    /// selects the gradient-based solver.
    pub use_irls: bool,

    /// IRLS stops once the mean absolute change of the weights is below this.
    pub weight_tol: f64,

    /// Iteration cap of the IRLS solver.
    pub irls_max_iter: usize,
}

impl Default for RobustOptions {
    /// IRLS enabled, tolerances of `1e-8`, 100 iterations (20 for IRLS).
    fn default() -> Self {
        Self {
            // solver selection and IRLS-specific settings
            use_irls: true,
            irls_max_iter: 20,
            weight_tol: 1e-4,
            // shared budgets
            max_iter: 100,
            max_nfev: None,
            // termination tolerances
            xtol: 1e-8,
            ftol: 1e-8,
            gtol: 1e-8,
        }
    }
}

/// Solve a robust least squares problem using M-estimators
///
/// Minimizes the sum of a robust loss applied to each residual, which limits
/// the pull that outlying observations exert on the fit.
///
/// # Arguments
///
/// * `residuals` - Callback returning the residual vector for given parameters and data
/// * `x0` - Starting point for the parameters
/// * `loss` - Robust loss function to apply to each residual
/// * `jacobian` - Optional analytic Jacobian; finite differences are used when `None`
/// * `data` - Extra data handed unchanged to `residuals` and `jacobian`
/// * `options` - Solver settings; `None` means `RobustOptions::default()`
///
/// # Returns
///
/// The result of the selected solver: IRLS when `options.use_irls` is set,
/// otherwise the gradient-based solver.
#[allow(dead_code)]
pub fn robust_least_squares<F, J, L, D, S1, S2>(
    residuals: F,
    x0: &ArrayBase<S1, Ix1>,
    loss: L,
    jacobian: Option<J>,
    data: &ArrayBase<S2, Ix1>,
    options: Option<RobustOptions>,
) -> OptimizeResult<OptimizeResults<f64>>
where
    F: Fn(&[f64], &[D]) -> Array1<f64>,
    J: Fn(&[f64], &[D]) -> Array2<f64>,
    L: RobustLoss,
    D: Clone,
    S1: Data<Elem = f64>,
    S2: Data<Elem = D>,
{
    let settings = options.unwrap_or_default();

    // Opt-out path: plain gradient descent on the robust cost.
    if !settings.use_irls {
        return gradient_based_robust_optimizer(residuals, x0, loss, jacobian, data, &settings);
    }

    // Default path: iteratively reweighted least squares.
    irls_optimizer(residuals, x0, loss, jacobian, data, &settings)
}

/// IRLS (Iteratively Reweighted Least Squares) optimizer
///
/// Every iteration recomputes the per-residual weights via `loss.weight`,
/// solves the weighted normal equations `(J^T W J) step = -J^T W r`, and then
/// backtracks along that step (halving it up to ten times) until the robust
/// cost decreases. If no trial point improves the cost, `x` is left unchanged.
///
/// # Termination
///
/// * Converged: the mean absolute weight change drops below
///   `options.weight_tol` (checked from the second iteration on), or the norm
///   of the solved step is below `options.xtol * (1 + ||x||)`.
/// * Not converged: the normal equations could not be solved,
///   `options.irls_max_iter` was reached, or the evaluation budget
///   (`options.max_nfev`, defaulting to `max_iter * n_params * 10`) ran out.
///
/// `success` in the returned result is `true` only in the converged cases and
/// `message` names the actual stop reason.
///
/// Non-contiguous `x0`/`data` inputs are copied into contiguous storage rather
/// than causing a panic.
#[allow(dead_code)]
fn irls_optimizer<F, J, L, D, S1, S2>(
    residuals: F,
    x0: &ArrayBase<S1, Ix1>,
    loss: L,
    jacobian: Option<J>,
    data: &ArrayBase<S2, Ix1>,
    options: &RobustOptions,
) -> OptimizeResult<OptimizeResults<f64>>
where
    F: Fn(&[f64], &[D]) -> Array1<f64>,
    J: Fn(&[f64], &[D]) -> Array2<f64>,
    L: RobustLoss,
    D: Clone,
    S1: Data<Elem = f64>,
    S2: Data<Elem = D>,
{
    // Collect into a fresh array so `x` is always in standard layout and
    // `as_slice()` below cannot fail, even if `x0` is a reversed/strided view.
    let mut x: Array1<f64> = x0.iter().copied().collect();
    let m = x.len();

    // Borrow `data` directly when it is contiguous; otherwise fall back to a
    // contiguous copy instead of panicking.
    let data_buf: Vec<D>;
    let data_slice: &[D] = match data.as_slice() {
        Some(slice) => slice,
        None => {
            data_buf = data.iter().cloned().collect();
            data_buf.as_slice()
        }
    };

    let max_nfev = options
        .max_nfev
        .unwrap_or_else(|| options.max_iter.saturating_mul(m).saturating_mul(10));
    let mut nfev = 0;
    let mut njev = 0;
    let mut iter = 0;

    // Residual evaluation at an arbitrary point (callers bump `nfev`).
    let eval_residuals = |point: &Array1<f64>| -> Array1<f64> {
        residuals(point.as_slice().expect("Operation failed"), data_slice)
    };

    // Compute initial residuals
    let mut res = eval_residuals(&x);
    nfev += 1;
    let n = res.len();

    // Initialize weights
    let mut weights = Array1::<f64>::ones(n);
    let mut prev_weights = weights.clone();

    // Forward-difference Jacobian; returns the matrix and the number of
    // residual evaluations it cost.
    let compute_numerical_jacobian =
        |x_val: &Array1<f64>, res_val: &Array1<f64>| -> (Array2<f64>, usize) {
            let eps = 1e-8;
            let mut jac = Array2::zeros((n, m));
            let mut count = 0;

            for j in 0..m {
                let mut x_h = x_val.clone();
                x_h[j] += eps;
                let res_h = eval_residuals(&x_h);
                count += 1;

                for i in 0..n {
                    jac[[i, j]] = (res_h[i] - res_val[i]) / eps;
                }
            }

            (jac, count)
        };

    let mut converged = false;
    let mut singular = false;

    // Main IRLS loop
    while iter < options.irls_max_iter && nfev < max_nfev {
        // Update weights based on residuals
        for i in 0..n {
            weights[i] = loss.weight(res[i]);
        }

        // Check weight convergence (skipped on the first pass, where
        // `prev_weights` is just the all-ones initial value)
        let weight_change = weights
            .iter()
            .zip(prev_weights.iter())
            .map(|(&w, &pw)| (w - pw).abs())
            .sum::<f64>()
            / n as f64;

        if weight_change < options.weight_tol && iter > 0 {
            converged = true;
            break;
        }

        prev_weights.assign(&weights);

        // Compute Jacobian
        let jac = match &jacobian {
            Some(jac_fn) => {
                njev += 1;
                jac_fn(x.as_slice().expect("Operation failed"), data_slice)
            }
            None => {
                let (j, count) = compute_numerical_jacobian(&x, &res);
                nfev += count;
                j
            }
        };

        // Form weighted normal equations: (J^T * W * J) * delta = -J^T * W * r
        // by scaling each row of J and r with sqrt(w_i).
        let mut weighted_jac = Array2::<f64>::zeros((n, m));
        let mut weighted_res = Array1::<f64>::zeros(n);

        for i in 0..n {
            let w = weights[i].sqrt();
            for j in 0..m {
                weighted_jac[[i, j]] = jac[[i, j]] * w;
            }
            weighted_res[i] = res[i] * w;
        }

        let jt_wj = weighted_jac.t().dot(&weighted_jac);
        let neg_jt_wr = -weighted_jac.t().dot(&weighted_res);

        // Solve for step
        let step = match solve(&jt_wj, &neg_jt_wr) {
            Some(step) => step,
            None => {
                // The linear solve failed (e.g. singular matrix); there is no
                // usable step, so stop and report the failure.
                singular = true;
                break;
            }
        };

        // Simple backtracking line search; keep the residuals of the accepted
        // point so they do not have to be evaluated a second time.
        let current_cost = compute_robust_cost(&res, &loss);
        let mut line_search_alpha = 1.0;
        let mut accepted: Option<(Array1<f64>, Array1<f64>)> = None;

        for _ in 0..10 {
            let x_new = &x + &step * line_search_alpha;
            let res_new = eval_residuals(&x_new);
            nfev += 1;

            if compute_robust_cost(&res_new, &loss) < current_cost {
                accepted = Some((x_new, res_new));
                break;
            }

            line_search_alpha *= 0.5;
        }

        // Convergence is judged on the full solved step relative to the point
        // it was computed at, so take the norms before `x` is updated.
        let step_norm = step.iter().map(|&s| s * s).sum::<f64>().sqrt();
        let x_norm = x.iter().map(|&xi| xi * xi).sum::<f64>().sqrt();

        // Update x and residuals (unchanged when the line search found nothing)
        if let Some((x_new, res_new)) = accepted {
            x = x_new;
            res = res_new;
        }

        if step_norm < options.xtol * (1.0 + x_norm) {
            converged = true;
            break;
        }

        iter += 1;
    }

    // Compute final cost
    let final_cost = compute_robust_cost(&res, &loss);

    let message = if converged {
        "Optimization terminated successfully."
    } else if singular {
        "Weighted normal equations could not be solved (singular matrix)."
    } else if iter >= options.irls_max_iter {
        "Maximum iterations reached."
    } else {
        "Maximum number of function evaluations reached."
    };

    // Create result
    let mut result = OptimizeResults::<f64>::default();
    result.x = x;
    result.fun = final_cost;
    result.nfev = nfev;
    result.njev = njev;
    result.nit = iter;
    result.success = converged;
    result.message = message.to_string();

    Ok(result)
}

/// Gradient-based robust optimizer using steepest descent with backtracking line search.
///
/// Minimises f(x) = Σ_i ρ(r_i(x)) where ρ is the robust loss.
/// The gradient is  ∇f = J^T · ψ(r)  where ψ(r_i) = weight(r_i) · r_i.
///
/// When `use_irls = false` the caller routes here as a fallback.
///
/// # Termination
///
/// * Converged: the gradient norm is below `options.gtol`, or the length of
///   the last trial step is below `options.xtol * (1 + ||x||)`.
/// * Not converged: `options.max_iter` was reached or the evaluation budget
///   (`options.max_nfev`, defaulting to `max_iter * n_params * 20`) ran out;
///   `message` states which of the two applied.
///
/// Non-contiguous `x0`/`data` inputs are copied into contiguous storage rather
/// than causing a panic.
#[allow(dead_code)]
fn gradient_based_robust_optimizer<F, J, L, D, S1, S2>(
    residuals: F,
    x0: &ArrayBase<S1, Ix1>,
    loss: L,
    jacobian: Option<J>,
    data: &ArrayBase<S2, Ix1>,
    options: &RobustOptions,
) -> OptimizeResult<OptimizeResults<f64>>
where
    F: Fn(&[f64], &[D]) -> Array1<f64>,
    J: Fn(&[f64], &[D]) -> Array2<f64>,
    L: RobustLoss,
    D: Clone,
    S1: Data<Elem = f64>,
    S2: Data<Elem = D>,
{
    // Collect into a fresh array so `x` is always in standard layout and
    // `as_slice()` below cannot fail, even if `x0` is a reversed/strided view.
    let mut x: Array1<f64> = x0.iter().copied().collect();
    let p = x.len(); // number of parameters

    // Borrow `data` directly when it is contiguous; otherwise fall back to a
    // contiguous copy instead of panicking.
    let data_buf: Vec<D>;
    let data_slice: &[D] = match data.as_slice() {
        Some(slice) => slice,
        None => {
            data_buf = data.iter().cloned().collect();
            data_buf.as_slice()
        }
    };

    let max_nfev = options
        .max_nfev
        .unwrap_or_else(|| options.max_iter.saturating_mul(p).saturating_mul(20));
    let mut nfev = 0;
    let mut njev = 0;
    let mut iter = 0;

    // Helper: compute total robust cost
    let robust_cost = |r: &Array1<f64>| -> f64 { r.iter().map(|&ri| loss.loss(ri)).sum() };

    // Numerical Jacobian helper (forward finite differences); every residual
    // evaluation it performs is added to `nfev_count`.
    let numerical_jacobian =
        |x_val: &Array1<f64>, res_val: &Array1<f64>, nfev_count: &mut usize| -> Array2<f64> {
            let eps = 1e-7_f64;
            let n_res = res_val.len();
            let mut jac = Array2::zeros((n_res, p));
            for j in 0..p {
                let mut x_h = x_val.clone();
                x_h[j] += eps;
                let res_h = residuals(x_h.as_slice().expect("x_h slice failed"), data_slice);
                *nfev_count += 1;
                for i in 0..n_res {
                    jac[[i, j]] = (res_h[i] - res_val[i]) / eps;
                }
            }
            jac
        };

    // Compute initial residuals and cost
    let mut res = residuals(x.as_slice().expect("x slice failed"), data_slice);
    nfev += 1;
    let mut cost = robust_cost(&res);

    let mut converged = false;

    while iter < options.max_iter && nfev < max_nfev {
        // Compute Jacobian: J is (n_obs × p)
        let jac = match &jacobian {
            Some(jac_fn) => {
                njev += 1;
                jac_fn(x.as_slice().expect("x slice failed"), data_slice)
            }
            None => numerical_jacobian(&x, &res, &mut nfev),
        };

        // ψ(r_i) = weight(r_i) * r_i  (= ρ'(r_i))
        let n_obs = res.len();
        let mut psi = Array1::<f64>::zeros(n_obs);
        for i in 0..n_obs {
            psi[i] = loss.weight(res[i]) * res[i];
        }

        // Gradient: ∇f = J^T · ψ
        let grad = jac.t().dot(&psi);

        // Check gradient convergence
        let grad_norm_sq = grad.iter().map(|&g| g * g).sum::<f64>();
        let grad_norm = grad_norm_sq.sqrt();
        if grad_norm < options.gtol {
            converged = true;
            break;
        }

        // Descent direction: -∇f (steepest descent)
        let direction = -&grad;

        // Armijo backtracking line search.
        // The initial step is 1 / max(||grad||, 1), so the first trial moves
        // at most a unit distance in parameter space when the gradient is large.
        let armijo_c = 1e-4_f64;
        let alpha_init = 1.0_f64 / grad_norm.max(1.0);
        let mut alpha = alpha_init;
        let mut found_step = false;

        for _ in 0..50 {
            let x_try = &x + &direction * alpha;
            let res_try = residuals(x_try.as_slice().expect("x_try slice failed"), data_slice);
            nfev += 1;
            let cost_try = robust_cost(&res_try);

            // Armijo condition: sufficient descent
            // directional derivative = -||grad||^2 (since direction = -grad)
            if cost_try <= cost - armijo_c * alpha * grad_norm_sq {
                x = x_try;
                res = res_try;
                cost = cost_try;
                found_step = true;
                break;
            }
            alpha *= 0.5;
        }

        // If the line search failed entirely, try one more (very small) step
        // and keep it only if it strictly lowers the cost.
        if !found_step {
            let x_try = &x + &direction * alpha;
            let res_try = residuals(x_try.as_slice().expect("x_try slice failed"), data_slice);
            nfev += 1;
            let cost_try = robust_cost(&res_try);
            if cost_try < cost {
                x = x_try;
                res = res_try;
                cost = cost_try;
            }
        }

        // Check x-change convergence: ||alpha * direction|| relative to ||x||
        let step_norm = alpha * grad_norm; // ||step|| = alpha * ||grad|| since direction = -grad
        let x_norm = x.iter().map(|&xi| xi * xi).sum::<f64>().sqrt();
        if step_norm < options.xtol * (1.0 + x_norm) {
            converged = true;
            iter += 1;
            break;
        }

        iter += 1;
    }

    // Name the real stop reason: the loop also ends when the evaluation budget
    // runs out before the iteration cap is hit.
    let message = if converged {
        "Gradient-based optimization terminated successfully."
    } else if iter >= options.max_iter {
        "Maximum iterations reached."
    } else {
        "Maximum number of function evaluations reached."
    };

    let mut result = OptimizeResults::<f64>::default();
    result.x = x;
    result.fun = cost;
    result.nfev = nfev;
    result.njev = njev;
    result.nit = iter;
    result.success = converged;
    result.message = message.to_string();

    Ok(result)
}

/// Total robust cost: `loss.loss(r)` summed over every residual `r`
#[allow(dead_code)]
fn compute_robust_cost<L: RobustLoss>(residuals: &Array1<f64>, loss: &L) -> f64 {
    let per_residual_cost = residuals.iter().map(|residual| loss.loss(*residual));
    per_residual_cost.sum()
}

/// Simple linear system solver (same as in least_squares.rs)
#[allow(dead_code)]
fn solve(a: &Array2<f64>, b: &Array1<f64>) -> Option<Array1<f64>> {
    use scirs2_linalg::solve;

    solve(&a.view(), &b.view(), None).ok()
}

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_huber_loss() {
        let huber = HuberLoss::new(1.0);
        let close = |actual: f64, expected: f64| (actual - expected).abs() < 1e-10;

        // |r| <= delta: quadratic branch with unit weight
        assert!(close(huber.loss(0.5), 0.125));
        assert!(close(huber.weight(0.5), 1.0));

        // |r| > delta: linear branch, weight delta / |r|
        assert!(close(huber.loss(2.0), 1.5));
        assert!(close(huber.weight(2.0), 0.5));
    }

    #[test]
    fn test_bisquare_loss() {
        let bisquare = BisquareLoss::new(4.685);

        // A residual well inside the threshold is down-weighted but kept
        let inlier_weight = bisquare.weight(1.0);
        assert!(bisquare.loss(1.0) > 0.0);
        assert!(inlier_weight > 0.0);
        assert!(inlier_weight < 1.0);

        // Beyond the threshold the loss saturates and the weight vanishes
        let saturation_gap = bisquare.loss(5.0) - bisquare.loss(10.0);
        assert!(saturation_gap.abs() < 1e-10);
        assert_eq!(bisquare.weight(5.0), 0.0);
    }

    #[test]
    fn test_cauchy_loss() {
        let cauchy = CauchyLoss::new(1.0);

        // The weight must strictly decrease as the residual magnitude grows
        let magnitudes = [0.0, 1.0, 2.0, 5.0];
        for pair in magnitudes.windows(2) {
            assert!(cauchy.weight(pair[0]) > cauchy.weight(pair[1]));
        }

        // Loss and weight are even functions of the residual
        assert_eq!(cauchy.loss(1.0), cauchy.loss(-1.0));
        assert_eq!(cauchy.weight(1.0), cauchy.weight(-1.0));
    }

    #[test]
    fn test_robust_least_squares_linear() {
        // Linear regression with outliers

        // Residuals of the model y = x[0] + x[1] * t.
        fn residual(x: &[f64], data: &[f64]) -> Array1<f64> {
            // data contains t values and y values concatenated
            let n = data.len() / 2;
            let t_values = &data[0..n];
            let y_values = &data[n..];

            let params = x;
            let mut res = Array1::zeros(n);
            for i in 0..n {
                res[i] = y_values[i] - (params[0] + params[1] * t_values[i]);
            }
            res
        }

        // The model is linear, so the Jacobian does not depend on the
        // parameters; `_x` is only there to match the expected signature.
        fn jacobian(_x: &[f64], data: &[f64]) -> Array2<f64> {
            let n = data.len() / 2;
            let t_values = &data[0..n];

            let mut jac = Array2::zeros((n, 2));
            for i in 0..n {
                jac[[i, 0]] = -1.0;
                jac[[i, 1]] = -t_values[i];
            }
            jac
        }

        let x0 = array![0.0, 0.0];

        // Concatenate t and y data; the last y value (10.0) is the outlier
        let data_array = array![0.0, 1.0, 2.0, 3.0, 4.0, 0.1, 0.9, 2.1, 2.9, 10.0];

        // Test with Huber loss
        let huber_loss = HuberLoss::new(1.0);
        let result =
            robust_least_squares(residual, &x0, huber_loss, Some(jacobian), &data_array, None)
                .expect("Operation failed");

        // The robust solution should be less affected by the outlier
        // Expected slope should be close to 1.0 (ignoring the outlier)
        println!("Result: {:?}", result);
        assert!(result.success);
        // Relax the tolerance since our implementation may have different convergence properties
        assert!((result.x[1] - 1.0).abs() < 0.5); // Slope should be closer to 1.0 than outlier influence would suggest
    }

    #[test]
    fn test_irls_convergence() {
        // Simple quadratic minimization with its optimum at [1.0, 2.0]
        fn residual(x: &[f64], _: &[f64]) -> Array1<f64> {
            array![x[0] - 1.0, x[1] - 2.0]
        }

        // Constant identity Jacobian; the parameters are not needed.
        fn jacobian(_x: &[f64], _: &[f64]) -> Array2<f64> {
            array![[1.0, 0.0], [0.0, 1.0]]
        }

        let x0 = array![0.0, 0.0];
        // No auxiliary data; the element type is spelled out for clarity.
        let data: Array1<f64> = array![];

        // Test with Huber loss (should converge to [1.0, 2.0])
        let huber_loss = HuberLoss::new(1.0);
        let result = robust_least_squares(residual, &x0, huber_loss, Some(jacobian), &data, None)
            .expect("Operation failed");

        assert!(result.success);
        assert!((result.x[0] - 1.0).abs() < 1e-3);
        assert!((result.x[1] - 2.0).abs() < 1e-3);
    }

    /// Gradient path on a line fit (`y = 2t + 1`) contaminated by one outlier.
    ///
    /// `use_irls = false` forces the call through
    /// `gradient_based_robust_optimizer`. The bisquare loss assigns zero
    /// weight to residuals beyond its tuning constant, so the fit is expected
    /// to land near intercept 1 and slope 2 despite the outlier.
    #[test]
    fn test_gradient_based_robust_optimizer_linear() {
        // r_i = y_i - (c0 + c1 * t_i), with `data` laid out as [t_0..t_n, y_0..y_n]
        fn residual(x: &[f64], data: &[f64]) -> Array1<f64> {
            let half = data.len() / 2;
            let (t_vals, y_vals) = data.split_at(half);
            t_vals
                .iter()
                .zip(y_vals)
                .map(|(&t, &y)| y - (x[0] + x[1] * t))
                .collect()
        }

        // Parameter-independent Jacobian of the linear model
        fn jacobian(_x: &[f64], data: &[f64]) -> Array2<f64> {
            let half = data.len() / 2;
            let mut jac = Array2::zeros((half, 2));
            for (row, &t) in data[..half].iter().enumerate() {
                jac[[row, 0]] = -1.0;
                jac[[row, 1]] = -t;
            }
            jac
        }

        // t = 0..=10; y follows 2t + 1 except for the last point, which is
        // replaced by 100 (the clean value would be 21).
        let t_vals: Vec<f64> = (0..11).map(|k| k as f64).collect();
        let y_vals: Vec<f64> = t_vals
            .iter()
            .enumerate()
            .map(|(k, &t)| if k == 10 { 100.0 } else { 2.0 * t + 1.0 })
            .collect();

        let data_array = Array1::from_vec(t_vals.iter().chain(&y_vals).copied().collect());

        let x0 = array![0.0, 0.0];

        let opts = RobustOptions {
            max_iter: 1000,
            xtol: 1e-8,
            ftol: 1e-10,
            gtol: 1e-8,
            use_irls: false, // route through the gradient-based solver
            ..RobustOptions::default()
        };

        let result = robust_least_squares(
            residual,
            &x0,
            BisquareLoss::new(4.685),
            Some(jacobian),
            &data_array,
            Some(opts),
        )
        .expect("gradient_based_robust_optimizer should not error");

        // With the outlier rejected, only the ten clean points shape the fit.
        println!(
            "Gradient-based result: c0={:.4} c1={:.4} cost={:.6} success={}",
            result.x[0], result.x[1], result.fun, result.success
        );
        let (intercept, slope) = (result.x[0], result.x[1]);
        assert!(
            (intercept - 1.0).abs() < 0.3,
            "Intercept {:.4} should be near 1.0",
            intercept
        );
        assert!(
            (slope - 2.0).abs() < 0.3,
            "Slope {:.4} should be near 2.0",
            slope
        );
    }

    /// Cross-check of the two solvers on noise-free data.
    ///
    /// With Huber loss and exact `y = 3t + 2` observations, both the IRLS path
    /// and the gradient path should end up at intercept 2 and slope 3. This
    /// exercises the gradient computation without relying on any particular
    /// outlier-rejection behaviour.
    #[test]
    fn test_gradient_huber_matches_irls_on_clean_data() {
        // r_i = y_i - (c0 + c1 * t_i), with `data` laid out as [t..., y...]
        fn residual(x: &[f64], data: &[f64]) -> Array1<f64> {
            let half = data.len() / 2;
            let (t_vals, y_vals) = data.split_at(half);
            t_vals
                .iter()
                .zip(y_vals)
                .map(|(&t, &y)| y - (x[0] + x[1] * t))
                .collect()
        }

        // Parameter-independent Jacobian of the linear model
        fn jacobian(_x: &[f64], data: &[f64]) -> Array2<f64> {
            let half = data.len() / 2;
            let mut jac = Array2::zeros((half, 2));
            for (row, &t) in data[..half].iter().enumerate() {
                jac[[row, 0]] = -1.0;
                jac[[row, 1]] = -t;
            }
            jac
        }

        // Six exact samples of y = 3t + 2
        let data_array = array![
            0.0, 1.0, 2.0, 3.0, 4.0, 5.0, // t
            2.0, 5.0, 8.0, 11.0, 14.0, 17.0 // y = 3t + 2
        ];
        let x0 = array![0.0, 0.0];

        // Same tight tolerances for both runs; only the solver and its
        // iteration cap differ.
        let tight_options = |use_irls: bool, max_iter: usize| RobustOptions {
            use_irls,
            max_iter,
            xtol: 1e-10,
            gtol: 1e-10,
            ftol: 1e-12,
            ..RobustOptions::default()
        };

        // IRLS path
        let res_irls = robust_least_squares(
            residual,
            &x0,
            HuberLoss::new(1.345),
            Some(jacobian),
            &data_array,
            Some(tight_options(true, 200)),
        )
        .expect("IRLS should not error");

        // Gradient path
        let res_grad = robust_least_squares(
            residual,
            &x0,
            HuberLoss::new(1.345),
            Some(jacobian),
            &data_array,
            Some(tight_options(false, 2000)),
        )
        .expect("gradient path should not error");

        // Both should converge close to [2, 3]
        let (irls_intercept, irls_slope) = (res_irls.x[0], res_irls.x[1]);
        let (grad_intercept, grad_slope) = (res_grad.x[0], res_grad.x[1]);
        assert!(
            (irls_intercept - 2.0).abs() < 1e-4,
            "IRLS intercept {:.6} should be 2.0",
            irls_intercept
        );
        assert!(
            (irls_slope - 3.0).abs() < 1e-4,
            "IRLS slope {:.6} should be 3.0",
            irls_slope
        );
        assert!(
            (grad_intercept - 2.0).abs() < 1e-3,
            "Gradient intercept {:.6} should be near 2.0",
            grad_intercept
        );
        assert!(
            (grad_slope - 3.0).abs() < 1e-3,
            "Gradient slope {:.6} should be near 3.0",
            grad_slope
        );
    }
}