use ferrolearn_core::FerroError;
use ndarray::Array1;
use num_traits::Float;
/// Limited-memory BFGS optimizer with a strong-Wolfe line search.
///
/// Stores only the last `m` curvature pairs instead of a full Hessian
/// approximation, making it suitable for moderate-to-high dimensional
/// smooth unconstrained minimization.
pub(crate) struct LbfgsOptimizer<F> {
// Number of curvature pairs (s, y) kept in history.
m: usize,
// Maximum number of outer iterations before declaring failure.
max_iter: usize,
// Convergence tolerance on the infinity norm of the gradient.
tol: F,
// Armijo (sufficient-decrease) constant for the line search.
c1: F,
// Curvature constant for the strong Wolfe condition.
c2: F,
// Maximum number of line-search trial evaluations per iteration.
max_ls_iter: usize,
}
impl<F: Float + Send + Sync + 'static> LbfgsOptimizer<F> {
/// Creates an optimizer with default memory (`m = 10`) and standard
/// strong-Wolfe constants (`c1 = 1e-4`, `c2 = 0.9`).
pub(crate) fn new(max_iter: usize, tol: F) -> Self {
    Self {
        m: 10,
        max_iter,
        tol,
        c1: F::from(1e-4).unwrap(),
        c2: F::from(0.9).unwrap(),
        max_ls_iter: 20,
    }
}

/// Minimizes `objective` starting from `x0`.
///
/// `objective` returns the function value and its gradient at the query
/// point. Convergence is declared when the infinity norm of the gradient
/// drops below `self.tol`.
///
/// # Errors
/// * `NumericalInstability` — the objective is NaN/infinite at `x0`, or
///   becomes NaN during iteration.
/// * `ConvergenceFailure` — `max_iter` iterations elapse without reaching
///   `tol` (a lenient `1e-2` gradient norm is still accepted as a last
///   resort before erroring).
pub(crate) fn minimize<Func>(
    &self,
    objective: Func,
    x0: Array1<F>,
) -> Result<Array1<F>, FerroError>
where
    Func: Fn(&Array1<F>) -> (F, Array1<F>),
{
    let mut x = x0;
    let (mut f, mut g) = objective(&x);
    if f.is_nan() || f.is_infinite() {
        return Err(FerroError::NumericalInstability {
            message: "initial objective value is NaN or infinite".into(),
        });
    }
    // Curvature-pair history consumed by the two-loop recursion.
    let mut s_hist: Vec<Array1<F>> = Vec::with_capacity(self.m);
    let mut y_hist: Vec<Array1<F>> = Vec::with_capacity(self.m);
    let mut rho_hist: Vec<F> = Vec::with_capacity(self.m);
    for iter in 0..self.max_iter {
        let g_norm = g.iter().fold(F::zero(), |acc, &v| acc.max(v.abs()));
        if g_norm < self.tol {
            return Ok(x);
        }
        let d = self.two_loop_recursion(&g, &s_hist, &y_hist, &rho_hist);
        // The line search returns the step vector actually taken. This
        // matters because it may fall back to searching along steepest
        // descent when `d` is not a descent direction; returning only
        // `alpha` would leave the state update inconsistent with the
        // returned objective value and gradient.
        let (s, f_new, g_new) = self.wolfe_line_search(&objective, &x, &f, &g, &d)?;
        // Reject NaN before committing the step to the iterate.
        if f_new.is_nan() {
            return Err(FerroError::NumericalInstability {
                message: format!("NaN encountered at iteration {iter}"),
            });
        }
        let y = &g_new - &g;
        let sy = s.dot(&y);
        x = &x + &s;
        f = f_new;
        g = g_new;
        // Store only positive-curvature pairs so the implicit inverse
        // Hessian approximation stays positive definite.
        if sy > F::zero() {
            if s_hist.len() == self.m {
                s_hist.remove(0);
                y_hist.remove(0);
                rho_hist.remove(0);
            }
            rho_hist.push(F::one() / sy);
            s_hist.push(s);
            y_hist.push(y);
        }
    }
    let g_norm = g.iter().fold(F::zero(), |acc, &v| acc.max(v.abs()));
    // Lenient fallback: accept a near-stationary point even when the
    // strict tolerance was never met.
    if g_norm < F::from(1e-2).unwrap() {
        return Ok(x);
    }
    Err(FerroError::ConvergenceFailure {
        iterations: self.max_iter,
        message: format!(
            "L-BFGS did not converge (gradient norm: {g_norm:.6e})",
            g_norm = g_norm.to_f64().unwrap()
        ),
    })
}

/// Computes the quasi-Newton search direction `-H·g` using the standard
/// L-BFGS two-loop recursion over the stored curvature pairs
/// (Nocedal & Wright, Algorithm 7.4).
fn two_loop_recursion(
    &self,
    g: &Array1<F>,
    s_hist: &[Array1<F>],
    y_hist: &[Array1<F>],
    rho_hist: &[F],
) -> Array1<F> {
    let k = s_hist.len();
    if k == 0 {
        // No curvature information yet: fall back to steepest descent.
        return g.mapv(|v| -v);
    }
    let mut q = g.clone();
    let mut alphas = vec![F::zero(); k];
    for i in (0..k).rev() {
        alphas[i] = rho_hist[i] * s_hist[i].dot(&q);
        q = &q - &y_hist[i].mapv(|v| v * alphas[i]);
    }
    // Initial inverse-Hessian scaling gamma = s'y / y'y from the most
    // recent pair; guard against a degenerate zero `y`.
    let gamma = {
        let sy = s_hist[k - 1].dot(&y_hist[k - 1]);
        let yy = y_hist[k - 1].dot(&y_hist[k - 1]);
        if yy > F::zero() { sy / yy } else { F::one() }
    };
    let mut r = q.mapv(|v| v * gamma);
    for i in 0..k {
        let beta = rho_hist[i] * y_hist[i].dot(&r);
        r = &r + &s_hist[i].mapv(|v| v * (alphas[i] - beta));
    }
    r.mapv(|v| -v)
}

/// Bisection-based line search for the strong Wolfe conditions.
///
/// Returns `(step, f_new, g_new)` where `step` is the displacement that
/// was actually evaluated (`alpha` times the direction searched). When
/// `d` is not a descent direction the search falls back to Armijo
/// backtracking along steepest descent, and the returned step reflects
/// that fallback direction.
fn wolfe_line_search<Func>(
    &self,
    objective: &Func,
    x: &Array1<F>,
    f0: &F,
    g0: &Array1<F>,
    d: &Array1<F>,
) -> Result<(Array1<F>, F, Array1<F>), FerroError>
where
    Func: Fn(&Array1<F>) -> (F, Array1<F>),
{
    let dg0 = g0.dot(d);
    if dg0 >= F::zero() {
        // `d` is not a descent direction (possible with stale curvature
        // pairs); recover along steepest descent instead.
        let d_sd = g0.mapv(|v| -v);
        let dg0_sd = g0.dot(&d_sd);
        return self.backtracking_line_search(objective, x, f0, &dg0_sd, &d_sd);
    }
    let mut alpha = F::one();
    let mut alpha_lo = F::zero();
    let mut alpha_hi = F::from(50.0).unwrap();
    let mut f_prev = *f0;
    let two = F::from(2.0).unwrap();
    for ls_iter in 0..self.max_ls_iter {
        let x_new = x + &d.mapv(|v| v * alpha);
        let (f_new, g_new) = objective(&x_new);
        // Sufficient decrease (Armijo) violated, or the value stopped
        // improving: shrink the bracket from above and bisect.
        if f_new > *f0 + self.c1 * alpha * dg0 || (f_new >= f_prev && ls_iter > 0) {
            alpha_hi = alpha;
            alpha = (alpha_lo + alpha_hi) / two;
            f_prev = f_new;
            continue;
        }
        let dg_new = g_new.dot(d);
        // Strong Wolfe curvature condition satisfied: accept the step.
        if dg_new.abs() <= self.c2 * dg0.abs() {
            return Ok((d.mapv(|v| v * alpha), f_new, g_new));
        }
        if dg_new >= F::zero() {
            alpha_hi = alpha;
        } else {
            alpha_lo = alpha;
        }
        f_prev = f_new;
        alpha = (alpha_lo + alpha_hi) / two;
    }
    // Line search exhausted: take the final trial step anyway and let the
    // curvature filter in `minimize` decide whether to store the pair.
    let x_new = x + &d.mapv(|v| v * alpha);
    let (f_new, g_new) = objective(&x_new);
    Ok((d.mapv(|v| v * alpha), f_new, g_new))
}

/// Armijo backtracking along `d`, halving `alpha` until sufficient
/// decrease holds. Returns `(step, f_new, g_new)` like
/// [`Self::wolfe_line_search`].
fn backtracking_line_search<Func>(
    &self,
    objective: &Func,
    x: &Array1<F>,
    f0: &F,
    dg0: &F,
    d: &Array1<F>,
) -> Result<(Array1<F>, F, Array1<F>), FerroError>
where
    Func: Fn(&Array1<F>) -> (F, Array1<F>),
{
    let mut alpha = F::one();
    let rho = F::from(0.5).unwrap();
    for _ in 0..self.max_ls_iter {
        let x_new = x + &d.mapv(|v| v * alpha);
        let (f_new, g_new) = objective(&x_new);
        if f_new <= *f0 + self.c1 * alpha * *dg0 {
            return Ok((d.mapv(|v| v * alpha), f_new, g_new));
        }
        alpha = alpha * rho;
    }
    // Backtracking exhausted: return the smallest trial step.
    let x_new = x + &d.mapv(|v| v * alpha);
    let (f_new, g_new) = objective(&x_new);
    Ok((d.mapv(|v| v * alpha), f_new, g_new))
}
}
#[cfg(test)]
mod tests {
use super::*;
use approx::assert_relative_eq;

/// Convex quadratic f(x) = 0.5‖x‖²; the gradient is x itself, so the
/// unique minimizer is the origin.
#[test]
fn test_lbfgs_quadratic() {
    let opt = LbfgsOptimizer::<f64>::new(100, 1e-8);
    let start = Array1::from_vec(vec![5.0, -3.0]);
    let solution = opt
        .minimize(|x| (0.5 * x.dot(x), x.clone()), start)
        .unwrap();
    assert_relative_eq!(solution[0], 0.0, epsilon = 1e-6);
    assert_relative_eq!(solution[1], 0.0, epsilon = 1e-6);
}

/// Rosenbrock function: a narrow curved valley with minimum at (1, 1),
/// the classic stress test for quasi-Newton methods.
#[test]
fn test_lbfgs_rosenbrock() {
    let opt = LbfgsOptimizer::<f64>::new(1000, 1e-6);
    let start = Array1::from_vec(vec![-1.0, 1.0]);
    let solution = opt
        .minimize(
            |x| {
                let a = 1.0 - x[0];
                let b = x[1] - x[0] * x[0];
                let grad = Array1::from_vec(vec![-2.0 * a - 400.0 * x[0] * b, 200.0 * b]);
                (a * a + 100.0 * b * b, grad)
            },
            start,
        )
        .unwrap();
    assert_relative_eq!(solution[0], 1.0, epsilon = 1e-3);
    assert_relative_eq!(solution[1], 1.0, epsilon = 1e-3);
}

/// Starting exactly at the optimum must succeed immediately (gradient
/// norm is already below tolerance on the first check).
#[test]
fn test_lbfgs_already_optimal() {
    let opt = LbfgsOptimizer::<f64>::new(100, 1e-8);
    let start = Array1::from_vec(vec![0.0, 0.0]);
    let solution = opt
        .minimize(|x| (0.5 * x.dot(x), x.clone()), start)
        .unwrap();
    assert_relative_eq!(solution[0], 0.0, epsilon = 1e-8);
    assert_relative_eq!(solution[1], 0.0, epsilon = 1e-8);
}
}