scirs2-optimize 0.4.2

//! L-BFGS-B: Limited-memory BFGS with box constraints (Byrd et al. 1995).
//!
//! Implements the full L-BFGS-B algorithm including:
//! - Two-loop recursion for the inverse Hessian-vector product
//! - Strong Wolfe line search with cubic/quadratic interpolation zoom
//! - Box-constraint handling via projected gradient and Cauchy-point computation
//! - Subspace minimization on the free (inactive) variables
//!
//! ## References
//!
//! - Byrd, R.H., Lu, P., Nocedal, J., Zhu, C. (1995).
//!   "A limited memory algorithm for bound constrained optimization."
//!   SIAM J. Sci. Comput. 16(5): 1190–1208.
//! - Zhu, C., Byrd, R.H., Lu, P., Nocedal, J. (1997).
//!   "Algorithm 778: L-BFGS-B." ACM Trans. Math. Softw. 23(4): 550–560.

use super::types::{LbfgsBConfig, OptResult};
use crate::error::OptimizeError;

// ─── Projection utilities ────────────────────────────────────────────────────

/// Project `x` onto the box `[lo, hi]` (element-wise).
///
/// Infinite bounds are represented as `f64::NEG_INFINITY` / `f64::INFINITY`.
pub fn project(x: &[f64], lo: &[f64], hi: &[f64]) -> Vec<f64> {
    x.iter()
        .zip(lo.iter())
        .zip(hi.iter())
        .map(|((xi, li), ui)| xi.clamp(*li, *ui))
        .collect()
}

/// Compute the projected gradient optimality measure: ‖x − P(x − g)‖∞.
///
/// This is the standard KKT stationarity measure for box-constrained problems.
pub fn projected_grad_norm(x: &[f64], g: &[f64], lo: &[f64], hi: &[f64]) -> f64 {
    x.iter()
        .zip(g.iter())
        .zip(lo.iter())
        .zip(hi.iter())
        .map(|(((xi, gi), li), ui)| {
            let px = (xi - gi).clamp(*li, *ui);
            (xi - px).abs()
        })
        .fold(0.0_f64, f64::max)
}

// ─── Two-loop recursion ───────────────────────────────────────────────────────

/// Compute the L-BFGS inverse Hessian-vector product H_k · g via the
/// two-loop recursion (Nocedal 1980).
///
/// `s_hist[i]` = x\_{k-m+i+1} - x\_{k-m+i}  (oldest first)
/// `y_hist[i]` = grad\_f\_{k-m+i+1} - grad\_f\_{k-m+i}
/// `rho_hist[i]` = 1 / (`y_hist[i]`^T `s_hist[i]`)
///
/// Returns H_k · g, the search direction before negation.
pub fn hv_product(
    g: &[f64],
    s_hist: &[Vec<f64>],
    y_hist: &[Vec<f64>],
    rho_hist: &[f64],
    gamma: f64,
) -> Vec<f64> {
    let n = g.len();
    let m = s_hist.len();
    let mut q = g.to_vec();
    let mut alpha = vec![0.0_f64; m];

    // First loop (newest to oldest)
    for i in (0..m).rev() {
        let rho_i = rho_hist[i];
        let sy: f64 = s_hist[i].iter().zip(q.iter()).map(|(si, qi)| si * qi).sum();
        alpha[i] = rho_i * sy;
        for j in 0..n {
            q[j] -= alpha[i] * y_hist[i][j];
        }
    }

    // Scale by initial Hessian approximation H_0 = gamma * I
    let mut r: Vec<f64> = q.iter().map(|qi| gamma * qi).collect();

    // Second loop (oldest to newest)
    for i in 0..m {
        let rho_i = rho_hist[i];
        let yr: f64 = y_hist[i].iter().zip(r.iter()).map(|(yi, ri)| yi * ri).sum();
        let beta = rho_i * yr;
        for j in 0..n {
            r[j] += s_hist[i][j] * (alpha[i] - beta);
        }
    }

    r
}

// ─── Strong Wolfe line search ─────────────────────────────────────────────────

/// Cubic interpolation between two points to find the minimum.
fn cubic_min(a: f64, fa: f64, dfa: f64, b: f64, fb: f64, dfb: f64) -> f64 {
    let d1 = dfa + dfb - 3.0 * (fb - fa) / (b - a);
    let d2_sq = d1 * d1 - dfa * dfb;
    if d2_sq < 0.0 {
        return 0.5 * (a + b);
    }
    let d2 = d2_sq.sqrt();
    let t = b - (b - a) * (dfb + d2 - d1) / (dfb - dfa + 2.0 * d2);
    t.clamp(
        a.min(b) + 1e-10 * (a - b).abs(),
        a.max(b) - 1e-10 * (a - b).abs(),
    )
}

/// Quadratic interpolation to find the minimum along a line segment.
fn quadratic_min(a: f64, fa: f64, dfa: f64, b: f64, fb: f64) -> f64 {
    let denom = 2.0 * (fb - fa - dfa * (b - a));
    if denom.abs() < 1e-14 {
        return 0.5 * (a + b);
    }
    let t = a - dfa * (b - a).powi(2) / denom;
    t.clamp(a.min(b), a.max(b))
}

/// Zoom phase of the strong Wolfe line search.
///
/// Bisects/interpolates inside a bracket [α_lo, α_hi] until strong Wolfe
/// conditions are satisfied.
fn wolfe_zoom<F>(
    f_and_g: &F,
    x: &[f64],
    d: &[f64],
    lo: &[f64],
    hi: &[f64],
    f0: f64,
    g0: f64, // directional derivative at α=0
    mut a_lo: f64,
    mut f_lo: f64,
    mut df_lo: f64,
    mut a_hi: f64,
    mut f_hi: f64,
    c1: f64,
    c2: f64,
    max_iter: usize,
) -> Result<f64, OptimizeError>
where
    F: Fn(&[f64]) -> (f64, Vec<f64>),
{
    let n = x.len();
    for _ in 0..max_iter {
        // Interpolate to find trial step
        let a_j = if (a_hi - a_lo).abs() < 1e-14 {
            0.5 * (a_lo + a_hi)
        } else {
            cubic_min(a_lo, f_lo, df_lo, a_hi, f_hi, {
                let x_j: Vec<f64> = (0..n)
                    .map(|i| (x[i] + a_hi * d[i]).clamp(lo[i], hi[i]))
                    .collect();
                let (_, g_j) = f_and_g(&x_j);
                g_j.iter()
                    .zip(d.iter())
                    .map(|(gi, di)| gi * di)
                    .sum::<f64>()
            })
        };

        let x_j: Vec<f64> = (0..n)
            .map(|i| (x[i] + a_j * d[i]).clamp(lo[i], hi[i]))
            .collect();
        let (f_j, g_j) = f_and_g(&x_j);
        let df_j: f64 = g_j.iter().zip(d.iter()).map(|(gi, di)| gi * di).sum();

        if f_j > f0 + c1 * a_j * g0 || f_j >= f_lo {
            // a_j becomes new high bracket
            a_hi = a_j;
            f_hi = f_j;
        } else {
            // Sufficient decrease satisfied
            if df_j.abs() <= -c2 * g0 {
                return Ok(a_j); // strong Wolfe satisfied
            }
            if df_j * (a_hi - a_lo) >= 0.0 {
                a_hi = a_lo;
                f_hi = f_lo;
            }
            a_lo = a_j;
            f_lo = f_j;
            df_lo = df_j;
        }

        if (a_hi - a_lo).abs() < 1e-14 * a_lo.abs().max(1.0) {
            break;
        }
    }
    Ok(a_lo)
}

/// Strong Wolfe line search.
///
/// Searches for a step α satisfying:
///   f(x + α·d) ≤ f(x) + c₁·α·∇f(x)^T d  (sufficient decrease)
///   |∇f(x + α·d)^T d| ≤ c₂ · |∇f(x)^T d|  (curvature condition)
///
/// # Arguments
/// * `f_and_g` — function returning (f, ∇f)
/// * `x` — current point
/// * `d` — search direction (must be a descent direction)
/// * `lo`, `hi` — box bounds (projections applied)
/// * `f0`, `g0` — f(x) and directional derivative ∇f(x)^T d at α=0
/// * `c1`, `c2` — Wolfe constants
/// * `alpha_init` — initial step size
/// * `max_iter` — maximum line-search iterations
pub fn wolfe_line_search<F>(
    f_and_g: &F,
    x: &[f64],
    d: &[f64],
    lo: &[f64],
    hi: &[f64],
    f0: f64,
    g0: f64,
    c1: f64,
    c2: f64,
    alpha_init: f64,
    max_iter: usize,
) -> Result<f64, OptimizeError>
where
    F: Fn(&[f64]) -> (f64, Vec<f64>),
{
    let n = x.len();
    let mut a_prev = 0.0_f64;
    let mut a_curr = alpha_init;
    let mut f_prev = f0;

    for i in 0..max_iter {
        let x_curr: Vec<f64> = (0..n)
            .map(|i| (x[i] + a_curr * d[i]).clamp(lo[i], hi[i]))
            .collect();
        let (f_curr, g_curr) = f_and_g(&x_curr);
        let df_curr: f64 = g_curr.iter().zip(d.iter()).map(|(gi, di)| gi * di).sum();

        if f_curr > f0 + c1 * a_curr * g0 || (i > 0 && f_curr >= f_prev) {
            // Bracket found: zoom between a_prev and a_curr
            return wolfe_zoom(
                f_and_g,
                x,
                d,
                lo,
                hi,
                f0,
                g0,
                a_prev,
                f_prev,
                {
                    let x_p: Vec<f64> = (0..n)
                        .map(|j| (x[j] + a_prev * d[j]).clamp(lo[j], hi[j]))
                        .collect();
                    let (_, gp) = f_and_g(&x_p);
                    gp.iter().zip(d.iter()).map(|(gi, di)| gi * di).sum::<f64>()
                },
                a_curr,
                f_curr,
                c1,
                c2,
                30,
            );
        }

        if df_curr.abs() <= -c2 * g0 {
            return Ok(a_curr); // strong Wolfe satisfied
        }

        if df_curr >= 0.0 {
            // Zoom between a_curr and a_prev (reversed)
            return wolfe_zoom(
                f_and_g, x, d, lo, hi, f0, g0, a_curr, f_curr, df_curr, a_prev, f_prev, c1, c2, 30,
            );
        }

        a_prev = a_curr;
        f_prev = f_curr;
        // Expand step
        a_curr = (a_curr * 2.0).min(1e10);
    }
    Ok(a_curr)
}

// ─── Cauchy point computation ─────────────────────────────────────────────────

/// Compute the generalized Cauchy point along the projected gradient path.
///
/// Starting from `x`, moves along the projected steepest-descent direction
/// −g within the box [lo, hi], picking the breakpoints where components
/// become active. Returns the Cauchy point `xc` and a "free variable" mask
/// indicating which components are strictly in the interior at xc.
///
/// This is an O(n log n) implementation following Byrd et al. (1995) §2.
pub fn cauchy_point(
    x: &[f64],
    g: &[f64],
    lo: &[f64],
    hi: &[f64],
    s_hist: &[Vec<f64>],
    y_hist: &[Vec<f64>],
    rho_hist: &[f64],
    gamma: f64,
) -> (Vec<f64>, Vec<bool>) {
    let n = x.len();

    // Step lengths to each bound along −g direction
    let mut breakpoints: Vec<(f64, usize)> = (0..n)
        .filter_map(|i| {
            let ti = if g[i] < 0.0 {
                (x[i] - hi[i]) / g[i] // moving in direction -g: x - t*g hits hi when t = (x-hi)/g
            } else if g[i] > 0.0 {
                (x[i] - lo[i]) / g[i] // hits lo when t = (x-lo)/g
            } else {
                return None;
            };
            if ti > 0.0 {
                Some((ti, i))
            } else {
                None
            }
        })
        .collect();

    breakpoints.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));

    let mut xc = x.to_vec();
    let mut free: Vec<bool> = vec![true; n];

    // Mark components that are immediately at bounds (g[i] == 0 means not moving)
    for i in 0..n {
        if g[i] == 0.0 {
            free[i] = false;
        }
    }

    let mut t_prev = 0.0_f64;
    let d: Vec<f64> = (0..n).map(|i| if free[i] { -g[i] } else { 0.0 }).collect();

    // Compute initial directional derivative f' = g^T d
    let fp0: f64 = d.iter().zip(g.iter()).map(|(di, gi)| di * gi).sum();
    // Compute second-order term: d^T B d where B = H^{-1} approximation
    // Use hv_product to get H^{-1} d, then compute d^T (H^{-1} d) ... wait,
    // actually we need d^T B d where B is the Hessian approximation (not its inverse).
    // For Cauchy point we use the forward-mode: direction d, derivative fp = g^T d < 0.
    // We advance until fp >= 0.

    let _ = fp0; // used below in segment checks

    // Walk through breakpoints, updating xc
    for &(t_j, idx) in &breakpoints {
        // Advance from t_prev to t_j along d (within bounds)
        let dt = t_j - t_prev;

        // Compute directional derivatives in the active segment
        // fp = g^T d_free  (first-order)
        let fp: f64 = (0..n)
            .filter(|&i| free[i])
            .map(|i| g[i] * d[i])
            .sum::<f64>();

        // fpp = d^T (H d) approximation:  use two-loop for H^{-1}v then invert scaling
        // Simpler: use curvature pairs to estimate d^T B d ≈ ||d||^2 / gamma
        let d_free: Vec<f64> = (0..n).map(|i| if free[i] { d[i] } else { 0.0 }).collect();
        let hv = hv_product(&d_free, s_hist, y_hist, rho_hist, gamma);
        let fpp: f64 = d_free
            .iter()
            .zip(hv.iter())
            .map(|(di, hi)| di * hi)
            .sum::<f64>();
        let fpp = if fpp < 1e-14 {
            1.0 / gamma
        } else {
            // fpp here is d^T H^{-1} d; we want d^T B d ≈ ||d||^2 / fpp or 1/gamma
            let dn2: f64 = d_free.iter().map(|di| di * di).sum();
            if dn2 > 1e-14 {
                dn2 / fpp
            } else {
                1.0 / gamma
            }
        };

        // Minimum of quadratic along [t_prev, t_j]: t* = t_prev - fp/fpp
        if fpp > 0.0 {
            let t_star = t_prev - fp / fpp;
            if t_star < t_j {
                // Minimum is within segment — use it
                for i in 0..n {
                    if free[i] {
                        xc[i] = (x[i] + t_star * d[i]).clamp(lo[i], hi[i]);
                    }
                }
                // All variables remain free at the interior minimum
                for bp in &breakpoints {
                    if bp.0 > t_prev {
                        free[bp.1] = false; // only those at earlier breakpoints are fixed
                    }
                }
                // Re-free the ones from t_star onwards
                for &(tb, bi) in &breakpoints {
                    if tb > t_star {
                        free[bi] = true;
                    }
                }
                return (xc, free);
            }
        }

        // Otherwise, advance to next breakpoint
        for i in 0..n {
            if free[i] {
                xc[i] = (x[i] + t_j * d[i]).clamp(lo[i], hi[i]);
            }
        }
        // Component idx hits bound — fix it
        free[idx] = false;
        t_prev = t_j;
    }

    // Went past all breakpoints — xc is at the final projected point
    (xc, free)
}

// ─── L-BFGS-B optimizer ───────────────────────────────────────────────────────

/// L-BFGS-B optimizer: minimizes f(x) subject to lo ≤ x ≤ hi.
pub struct LbfgsBOptimizer {
    /// Algorithm configuration.
    pub config: LbfgsBConfig,
}

impl LbfgsBOptimizer {
    /// Create a new optimizer with the given configuration.
    pub fn new(config: LbfgsBConfig) -> Self {
        Self { config }
    }

    /// Create a new optimizer with default configuration.
    pub fn default_config() -> Self {
        Self {
            config: LbfgsBConfig::default(),
        }
    }

    /// Minimize `f_and_g` subject to box constraints `lo ≤ x ≤ hi`.
    ///
    /// Use `f64::NEG_INFINITY` / `f64::INFINITY` for unbounded components.
    ///
    /// # Arguments
    /// * `f_and_g` — closure returning (f(x), ∇f(x))
    /// * `x0` — initial point (will be projected onto [lo, hi])
    /// * `lo` — lower bounds (use `f64::NEG_INFINITY` for none)
    /// * `hi` — upper bounds (use `f64::INFINITY` for none)
    ///
    /// # Returns
    /// An `OptResult` with the minimizer and convergence information.
    pub fn minimize<F>(
        &self,
        f_and_g: &F,
        x0: &[f64],
        lo: &[f64],
        hi: &[f64],
    ) -> Result<OptResult, OptimizeError>
    where
        F: Fn(&[f64]) -> (f64, Vec<f64>),
    {
        let n = x0.len();
        if lo.len() != n || hi.len() != n {
            return Err(OptimizeError::ValueError(format!(
                "Bound vectors must have length {}, got lo={} hi={}",
                n,
                lo.len(),
                hi.len()
            )));
        }

        // Validate bounds
        for i in 0..n {
            if lo[i] > hi[i] {
                return Err(OptimizeError::ValueError(format!(
                    "lo[{}]={} > hi[{}]={}",
                    i, lo[i], i, hi[i]
                )));
            }
        }

        let cfg = &self.config;
        let m = cfg.m;

        // Project initial point
        let mut x = project(x0, lo, hi);
        let (mut f_val, mut g) = f_and_g(&x);

        // Circular buffers for curvature pairs
        let mut s_hist: Vec<Vec<f64>> = Vec::with_capacity(m);
        let mut y_hist: Vec<Vec<f64>> = Vec::with_capacity(m);
        let mut rho_hist: Vec<f64> = Vec::with_capacity(m);

        // H₀ = γ I, initially identity
        let mut gamma = 1.0_f64;

        let mut n_iter = 0usize;
        let mut converged = false;

        for iter in 0..cfg.max_iter {
            n_iter = iter;
            let pg_norm = projected_grad_norm(&x, &g, lo, hi);
            if pg_norm < cfg.tol {
                converged = true;
                break;
            }

            // Compute descent direction via two-loop recursion (then negate)
            let hg = hv_product(&g, &s_hist, &y_hist, &rho_hist, gamma);
            let mut d: Vec<f64> = hg.iter().map(|v| -v).collect();

            // Project direction: if component at bound and direction moves out, zero it
            for i in 0..n {
                let at_lo = x[i] <= lo[i] + 1e-12;
                let at_hi = x[i] >= hi[i] - 1e-12;
                if (at_lo && d[i] < 0.0) || (at_hi && d[i] > 0.0) {
                    d[i] = 0.0;
                }
            }

            // Directional derivative
            let mut slope: f64 = g.iter().zip(d.iter()).map(|(gi, di)| gi * di).sum();

            // If direction is not a descent direction or is zero, use projected gradient
            if slope >= 0.0 {
                d = g.iter().map(|gi| -gi).collect();
                for i in 0..n {
                    let at_lo = x[i] <= lo[i] + 1e-12;
                    let at_hi = x[i] >= hi[i] - 1e-12;
                    if (at_lo && d[i] < 0.0) || (at_hi && d[i] > 0.0) {
                        d[i] = 0.0;
                    }
                }
                slope = g.iter().zip(d.iter()).map(|(gi, di)| gi * di).sum();
            }

            if slope >= 0.0 {
                // Fully constrained gradient, check convergence
                converged = pg_norm < cfg.tol;
                break;
            }

            // Strong Wolfe line search
            let alpha = match wolfe_line_search(
                f_and_g,
                &x,
                &d,
                lo,
                hi,
                f_val,
                slope,
                cfg.c1,
                cfg.c2,
                cfg.alpha_init,
                cfg.max_ls_iter,
            ) {
                Ok(a) => a,
                Err(_) => 1e-8, // fallback to tiny step
            };

            // Compute new iterate
            let x_new: Vec<f64> = (0..n)
                .map(|i| (x[i] + alpha * d[i]).clamp(lo[i], hi[i]))
                .collect();

            let (f_new, g_new) = f_and_g(&x_new);

            // Curvature pair
            let s: Vec<f64> = (0..n).map(|i| x_new[i] - x[i]).collect();
            let y: Vec<f64> = (0..n).map(|i| g_new[i] - g[i]).collect();
            let sy: f64 = s.iter().zip(y.iter()).map(|(si, yi)| si * yi).sum();

            // Only store if curvature condition sy > 0 (positive definite update)
            if sy > 1e-14 * s.iter().map(|si| si * si).sum::<f64>().sqrt() {
                if s_hist.len() == m {
                    s_hist.remove(0);
                    y_hist.remove(0);
                    rho_hist.remove(0);
                }
                let yy: f64 = y.iter().map(|yi| yi * yi).sum();
                gamma = if yy > 1e-14 { sy / yy } else { gamma };
                rho_hist.push(1.0 / sy);
                s_hist.push(s);
                y_hist.push(y);
            }

            x = x_new;
            f_val = f_new;
            g = g_new;
        }

        let grad_norm = g.iter().map(|gi| gi * gi).sum::<f64>().sqrt();

        Ok(OptResult {
            x,
            f_val,
            grad_norm,
            n_iter,
            converged,
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::second_order::types::LbfgsBConfig;

    fn rosenbrock(x: &[f64]) -> (f64, Vec<f64>) {
        let f = (1.0 - x[0]).powi(2) + 100.0 * (x[1] - x[0].powi(2)).powi(2);
        let g0 = -2.0 * (1.0 - x[0]) - 400.0 * x[0] * (x[1] - x[0].powi(2));
        let g1 = 200.0 * (x[1] - x[0].powi(2));
        (f, vec![g0, g1])
    }

    fn quadratic(x: &[f64]) -> (f64, Vec<f64>) {
        let f: f64 = x.iter().map(|xi| 0.5 * xi * xi).sum();
        let g: Vec<f64> = x.to_vec();
        (f, g)
    }

    #[test]
    fn test_lbfgsb_quadratic() {
        let opt = LbfgsBOptimizer::default_config();
        let x0 = vec![3.0, -2.0, 1.5];
        let lo = vec![f64::NEG_INFINITY; 3];
        let hi = vec![f64::INFINITY; 3];
        let result = opt
            .minimize(&quadratic, &x0, &lo, &hi)
            .expect("minimize failed");
        for xi in &result.x {
            assert!(xi.abs() < 1e-5, "Expected x≈0, got {}", xi);
        }
        assert!(result.converged);
    }

    #[test]
    fn test_lbfgsb_rosenbrock() {
        let mut cfg = LbfgsBConfig::default();
        cfg.max_iter = 2000;
        cfg.tol = 1e-5;
        let opt = LbfgsBOptimizer::new(cfg);
        let x0 = vec![-1.0, 1.0];
        let lo = vec![f64::NEG_INFINITY; 2];
        let hi = vec![f64::INFINITY; 2];
        let result = opt
            .minimize(&rosenbrock, &x0, &lo, &hi)
            .expect("minimize failed");
        assert!(
            (result.x[0] - 1.0).abs() < 0.05 && (result.x[1] - 1.0).abs() < 0.05,
            "Rosenbrock solution wrong: {:?}",
            result.x
        );
    }

    #[test]
    fn test_lbfgsb_box_constraint() {
        let opt = LbfgsBOptimizer::default_config();
        let x0 = vec![3.0, 3.0];
        let lo = vec![-1.0, -1.0];
        let hi = vec![1.0, 1.0];
        let result = opt
            .minimize(&quadratic, &x0, &lo, &hi)
            .expect("minimize failed");
        for (xi, (li, ui)) in result.x.iter().zip(lo.iter().zip(hi.iter())) {
            assert!(
                *xi >= *li - 1e-9 && *xi <= *ui + 1e-9,
                "Bound violated: {}",
                xi
            );
        }
    }

    #[test]
    fn test_lbfgsb_active_constraint() {
        // Minimize (x+2)^2 with x >= 0 — solution is at x=0 (active bound)
        let opt = LbfgsBOptimizer::default_config();
        let x0 = vec![1.0];
        let lo = vec![0.0];
        let hi = vec![f64::INFINITY];
        let obj = |x: &[f64]| -> (f64, Vec<f64>) {
            let f = (x[0] + 2.0).powi(2);
            let g = vec![2.0 * (x[0] + 2.0)];
            (f, g)
        };
        let result = opt.minimize(&obj, &x0, &lo, &hi).expect("minimize failed");
        assert!(
            result.x[0] < 0.01,
            "Active bound not satisfied: x={}",
            result.x[0]
        );
    }

    #[test]
    fn test_wolfe_conditions_satisfied() {
        let obj = |x: &[f64]| -> (f64, Vec<f64>) {
            let f = 0.5 * x[0] * x[0];
            let g = vec![x[0]];
            (f, g)
        };
        let x = vec![2.0];
        let d = vec![-2.0]; // descent direction
        let lo = vec![f64::NEG_INFINITY];
        let hi = vec![f64::INFINITY];
        let f0 = 2.0;
        let g0 = -4.0_f64; // g^T d = 2 * (-2) = -4
        let c1 = 1e-4;
        let c2 = 0.9;
        let alpha = wolfe_line_search(&obj, &x, &d, &lo, &hi, f0, g0, c1, c2, 1.0, 30)
            .expect("line search failed");
        // Verify sufficient decrease
        let x_new: Vec<f64> = x
            .iter()
            .zip(d.iter())
            .map(|(xi, di)| xi + alpha * di)
            .collect();
        let (f_new, g_new) = obj(&x_new);
        assert!(f_new <= f0 + c1 * alpha * g0, "Wolfe c1 violated");
        let slope_new: f64 = g_new.iter().zip(d.iter()).map(|(gi, di)| gi * di).sum();
        assert!(slope_new.abs() <= -c2 * g0, "Wolfe c2 violated");
    }

    #[test]
    fn test_two_loop_recursion() {
        // For a diagonal Hessian H = diag(h) stored as L-BFGS with one pair,
        // verify the approximation is consistent with a positive-definite update.
        let n = 4;
        let s = vec![1.0, 0.0, 0.0, 0.0];
        let y = vec![3.0, 0.0, 0.0, 0.0];
        let rho = 1.0 / 3.0;
        let g = vec![1.0; n];
        let result = hv_product(&g, &[s], &[y], &[rho], 1.0);
        // H^{-1} g should be defined; just check it's not degenerate
        assert_eq!(result.len(), n);
        // For a simple 1-pair case the update is well-defined
        assert!(result.iter().all(|v| v.is_finite()));
    }

    #[test]
    fn test_lbfgsb_memory_m() {
        let obj = |x: &[f64]| -> (f64, Vec<f64>) {
            let f = 0.5 * (x[0] * x[0] + 4.0 * x[1] * x[1]);
            let g = vec![x[0], 4.0 * x[1]];
            (f, g)
        };

        let mut cfg5 = LbfgsBConfig::default();
        cfg5.m = 5;
        cfg5.max_iter = 500;
        let opt5 = LbfgsBOptimizer::new(cfg5);

        let mut cfg10 = LbfgsBConfig::default();
        cfg10.m = 10;
        cfg10.max_iter = 500;
        let opt10 = LbfgsBOptimizer::new(cfg10);

        let x0 = vec![3.0, 2.0];
        let lo = vec![f64::NEG_INFINITY; 2];
        let hi = vec![f64::INFINITY; 2];

        let r5 = opt5.minimize(&obj, &x0, &lo, &hi).expect("m=5 failed");
        let r10 = opt10.minimize(&obj, &x0, &lo, &hi).expect("m=10 failed");

        assert!(r5.f_val.abs() < 1e-8, "m=5 solution wrong: {}", r5.f_val);
        assert!(r10.f_val.abs() < 1e-8, "m=10 solution wrong: {}", r10.f_val);
    }

    #[test]
    fn test_lbfgsb_gradient_check() {
        // Verify that the line search returns a positive finite step for a valid descent direction.
        // f(x) = (x[0] - 1)^2 + (x[1] + 2)^2
        // At x = [0, 0]: g = [-2, 4], so the gradient direction is [-2, 4].
        // The ascent direction is [2, -4] (positive dot product with g).
        let obj = |x: &[f64]| -> (f64, Vec<f64>) {
            let f = (x[0] - 1.0).powi(2) + (x[1] + 2.0).powi(2);
            let g = vec![2.0 * (x[0] - 1.0), 2.0 * (x[1] + 2.0)];
            (f, g)
        };
        let x = vec![0.0, 0.0];
        let lo = vec![f64::NEG_INFINITY; 2];
        let hi = vec![f64::INFINITY; 2];
        let (f0, g0_vec) = obj(&x);
        // Descent direction = negated gradient: [2, -4]
        let d_desc: Vec<f64> = g0_vec.iter().map(|v| -v).collect();
        let g0_desc: f64 = g0_vec
            .iter()
            .zip(d_desc.iter())
            .map(|(gi, di)| gi * di)
            .sum();
        // Verify it is indeed a descent direction (slope < 0)
        assert!(
            g0_desc < 0.0,
            "Descent slope should be negative, got {}",
            g0_desc
        );
        // The ascent direction is +g, slope > 0
        let d_asc: Vec<f64> = g0_vec.clone();
        let g0_asc: f64 = g0_vec
            .iter()
            .zip(d_asc.iter())
            .map(|(gi, di)| gi * di)
            .sum();
        assert!(
            g0_asc > 0.0,
            "Ascent slope should be positive, got {}",
            g0_asc
        );
        // Wolfe line search on descent direction should return a valid step
        let alpha = wolfe_line_search(&obj, &x, &d_desc, &lo, &hi, f0, g0_desc, 1e-4, 0.9, 1.0, 30)
            .expect("ls failed");
        assert!(
            alpha > 0.0 && alpha.is_finite(),
            "Step size invalid: {}",
            alpha
        );
    }

    #[test]
    fn test_cauchy_point_feasible() {
        let x = vec![0.5, 0.5];
        let g = vec![1.0, -1.0];
        let lo = vec![0.0, 0.0];
        let hi = vec![1.0, 1.0];
        let (xc, _free) = cauchy_point(&x, &g, &lo, &hi, &[], &[], &[], 1.0);
        for (xi, (li, ui)) in xc.iter().zip(lo.iter().zip(hi.iter())) {
            assert!(*xi >= *li - 1e-10, "Cauchy point violates lower bound");
            assert!(*xi <= *ui + 1e-10, "Cauchy point violates upper bound");
        }
    }
}