Skip to main content

scirs2_optimize/second_order/
sr1.rs

1//! SR1 (Symmetric Rank-1) quasi-Newton optimizer with trust-region globalization.
2//!
3//! Implements the SR1 update rule and a limited-memory variant (LSR1) using
4//! a compact representation. Trust-region subproblem is solved by bisection
5//! on the Levenberg–Marquardt regularization parameter λ.
6//!
7//! ## References
8//!
9//! - Byrd, R.H., Khalfan, H.F., Schnabel, R.B. (1994).
10//!   "Analysis of a symmetric rank-one trust region method."
11//!   SIAM J. Optim. 4(1): 1–21.
12//! - Conn, A.R., Gould, N.I.M., Toint, Ph.L. (1991).
13//!   "Convergence of quasi-Newton matrices generated by the symmetric
14//!   rank one update." Math. Program. 50: 177–195.
15
16use super::types::{OptResult, Sr1Config};
17use crate::error::OptimizeError;
18
19// ─── Dense matrix utilities (small n) ────────────────────────────────────────
20
/// Dense matrix-vector product: returns A·x for a row-major n×n matrix `a`.
fn mat_vec(a: &[f64], x: &[f64], n: usize) -> Vec<f64> {
    let mut y = Vec::with_capacity(n);
    for i in 0..n {
        let mut acc = 0.0;
        for j in 0..n {
            acc += a[i * n + j] * x[j];
        }
        y.push(acc);
    }
    y
}
27
/// Matrix-vector product for a symmetric matrix stored as a FULL row-major
/// n×n array (no packed storage), so the arithmetic is the same as a general
/// dense product.
fn sym_mat_vec(a: &[f64], x: &[f64], n: usize) -> Vec<f64> {
    let mut out = vec![0.0_f64; n];
    for (i, oi) in out.iter_mut().enumerate() {
        let row = &a[i * n..i * n + n];
        *oi = row.iter().zip(&x[..n]).map(|(aij, xj)| aij * xj).sum();
    }
    out
}
32
/// Rank-1 update: add `scale · u vᵀ` to the dense row-major n×n matrix `a`
/// in place.
///
/// Note: this adds a SINGLE outer product (the previous doc claimed a
/// symmetric pair `u vᵀ + v uᵀ`, which the code never did). When called
/// with `u == v` — as in the SR1 update — the result is symmetric anyway.
///
/// Takes `&mut [f64]` rather than `&mut Vec<f64>` (clippy `ptr_arg`);
/// existing callers passing `&mut Vec<f64>` still compile via deref coercion.
fn add_outer(a: &mut [f64], u: &[f64], v: &[f64], n: usize, scale: f64) {
    for i in 0..n {
        for j in 0..n {
            a[i * n + j] += scale * u[i] * v[j];
        }
    }
}
41
42// ─── SR1 update ──────────────────────────────────────────────────────────────
43
/// Apply one SR1 update to the dense Hessian approximation B (n×n, row-major).
///
/// Update rule:
///   B_{k+1} = B_k + (y − B_k s)(y − B_k s)ᵀ / ((y − B_k s)ᵀ s)
///
/// Skip condition (Byrd et al. 1994): the update is skipped when
///   |(y − B s)ᵀ s| < skip_tol · ‖s‖ · ‖y − B s‖
/// or when y ≈ B s (no new curvature information), since dividing by a
/// tiny denominator would destroy the approximation.
///
/// Takes `b: &mut [f64]` (clippy `ptr_arg`); callers passing `&mut Vec<f64>`
/// coerce automatically. Returns `true` if the update was applied,
/// `false` if it was skipped.
pub fn sr1_update_dense(b: &mut [f64], s: &[f64], y: &[f64], n: usize, skip_tol: f64) -> bool {
    // r = y − B s (residual of the secant equation).
    let r: Vec<f64> = (0..n)
        .map(|i| {
            let bs_i: f64 = (0..n).map(|j| b[i * n + j] * s[j]).sum();
            y[i] - bs_i
        })
        .collect();

    let rs: f64 = r.iter().zip(s).map(|(ri, si)| ri * si).sum(); // rᵀ s
    let r_norm = r.iter().map(|ri| ri * ri).sum::<f64>().sqrt();
    let s_norm = s.iter().map(|si| si * si).sum::<f64>().sqrt();

    // Skip if r is essentially zero (y ≈ Bs, nothing to learn) OR the
    // denominator is too small relative to ‖s‖·‖r‖ (Byrd et al. skip rule).
    if r_norm < 1e-14 || rs.abs() < skip_tol * s_norm * r_norm {
        return false; // skip update
    }

    // Rank-1 update B += (1/rᵀs) · r rᵀ — symmetric since both factors are r.
    let inv_rs = 1.0 / rs;
    for i in 0..n {
        for j in 0..n {
            b[i * n + j] += inv_rs * r[i] * r[j];
        }
    }
    true
}
69
70// ─── Limited-memory SR1 ───────────────────────────────────────────────────────
71
/// Apply the limited-memory SR1 (LSR1) inverse-Hessian approximation to `g`.
///
/// Computes H·g where
///   H = H₀ + Ψ (Ψᵀ Y)⁻¹ Ψᵀ,   Ψ = S − H₀ Y,   H₀ = gamma · I,
/// with S = [s₁ … s_m] and Y = [y₁ … y_m] the stored curvature pairs.
///
/// For a single stored pair this is exactly the SR1 inverse update
///   H₊ = H₀ + (s − H₀y)(s − H₀y)ᵀ / ((s − H₀y)ᵀ y).
///
/// NOTE(review): for m > 1 the middle matrix Ψᵀ Y is only symmetric when
/// the stored pairs are consistent with an exact SR1 sequence (the compact
/// representation of Byrd et al. 1994 uses D + L + Lᵀ − Yᵀ H₀ Y); confirm
/// this simplification is the intended variant.
///
/// The small m×m system is solved by Gaussian elimination with partial
/// pivoting (not a Cholesky factorization — Ψᵀ Y is generally indefinite);
/// if it is singular the correction term is dropped and H₀·g is returned.
pub fn lsr1_hv_product(
    g: &[f64],
    s_hist: &[Vec<f64>],
    y_hist: &[Vec<f64>],
    gamma: f64,
) -> Vec<f64> {
    let n = g.len();
    let m = s_hist.len();

    // H₀ g = gamma * g
    let mut r: Vec<f64> = g.iter().map(|gi| gamma * gi).collect();

    // No stored pairs yet: the approximation is just the scaled identity.
    if m == 0 {
        return r;
    }

    // Build Ψ = S - H₀ Y  (each column: s_i - gamma * y_i)
    let psi: Vec<Vec<f64>> = (0..m)
        .map(|i| {
            (0..n)
                .map(|j| s_hist[i][j] - gamma * y_hist[i][j])
                .collect::<Vec<f64>>()
        })
        .collect();

    // Build M = Ψ^T Y  (m×m matrix), entry (i,j) = ψ_i · y_j
    let mut big_m = vec![0.0_f64; m * m];
    for i in 0..m {
        for j in 0..m {
            big_m[i * m + j] = psi[i]
                .iter()
                .zip(y_hist[j].iter())
                .map(|(pi, yj)| pi * yj)
                .sum();
        }
    }

    // Ψ^T g  (m-vector)
    let psi_g: Vec<f64> = (0..m)
        .map(|i| psi[i].iter().zip(g.iter()).map(|(pi, gi)| pi * gi).sum())
        .collect();

    // Solve M v = Ψ^T g via Gaussian elimination (small system).
    // On a singular M the correction is silently dropped — the caller
    // then effectively restarts from the scaled-identity model.
    let v = match gaussian_solve(&big_m, &psi_g, m) {
        Some(x) => x,
        None => return r, // singular — use H₀ g
    };

    // r += Ψ v  (add the low-rank correction)
    for i in 0..m {
        for j in 0..n {
            r[j] += psi[i][j] * v[i];
        }
    }

    r
}
143
/// Solve A x = b for a dense m×m system via Gaussian elimination with
/// partial pivoting on an augmented matrix [A | b].
///
/// Returns `None` when a pivot magnitude falls below 1e-14 (the matrix is
/// numerically singular).
fn gaussian_solve(a: &[f64], b: &[f64], m: usize) -> Option<Vec<f64>> {
    let w = m + 1; // row width of the augmented matrix [A | b]
    let mut aug = vec![0.0_f64; m * w];
    for row in 0..m {
        aug[row * w..row * w + m].copy_from_slice(&a[row * m..(row + 1) * m]);
        aug[row * w + m] = b[row];
    }

    // Forward elimination with row pivoting.
    for col in 0..m {
        // Select the row with the largest magnitude entry in this column.
        let mut pivot_row = col;
        let mut pivot_mag = aug[col * w + col].abs();
        for row in (col + 1)..m {
            let mag = aug[row * w + col].abs();
            if mag > pivot_mag {
                pivot_mag = mag;
                pivot_row = row;
            }
        }
        if pivot_mag < 1e-14 {
            return None; // numerically singular
        }
        if pivot_row != col {
            for j in 0..w {
                aug.swap(col * w + j, pivot_row * w + j);
            }
        }
        // Zero out the entries below the pivot.
        let pivot = aug[col * w + col];
        for row in (col + 1)..m {
            let factor = aug[row * w + col] / pivot;
            for j in col..w {
                let delta = aug[col * w + j] * factor;
                aug[row * w + j] -= delta;
            }
        }
    }

    // Back substitution on the upper-triangular system.
    let mut x = vec![0.0_f64; m];
    for i in (0..m).rev() {
        let diag = aug[i * w + i];
        if diag.abs() < 1e-14 {
            return None;
        }
        let tail: f64 = ((i + 1)..m).map(|j| aug[i * w + j] * x[j]).sum();
        x[i] = (aug[i * w + m] - tail) / diag;
    }
    Some(x)
}
198
199// ─── Trust-region subproblem ──────────────────────────────────────────────────
200
201/// Solve the trust-region subproblem: minimise g^T d + 0.5 d^T B d  s.t. ‖d‖ ≤ δ.
202///
203/// Uses bisection on the Levenberg–Marquardt regularization λ:
204///   (B + λI) d = −g,  with λ chosen so that ‖d(λ)‖ ≤ δ.
205///
206/// If B + λI is not positive-definite for the initial λ, λ is increased.
207pub fn trust_region_step(b: &[f64], g: &[f64], delta: f64, n: usize) -> Vec<f64> {
208    // Try Newton step (λ = 0)
209    let b_vec = b.to_vec();
210    if let Some(d) = solve_linear_system(&b_vec, g, n) {
211        let d_neg: Vec<f64> = d.iter().map(|di| -di).collect();
212        let dnorm = d_neg.iter().map(|di| di * di).sum::<f64>().sqrt();
213        if dnorm <= delta {
214            return d_neg; // unconstrained step is within trust region
215        }
216    }
217
218    // Bisect on λ to find the boundary step
219    let mut lam_lo = 0.0_f64;
220    let mut lam_hi = {
221        // Upper bound: λ ≥ ‖g‖/δ − σ_min(B)
222        let g_norm = g.iter().map(|gi| gi * gi).sum::<f64>().sqrt();
223        g_norm / delta
224            + b.iter()
225                .enumerate()
226                .filter(|(idx, _)| idx % (n + 1) == 0)
227                .map(|(_, v)| v.abs())
228                .fold(0.0_f64, f64::max)
229    };
230    lam_hi = lam_hi.max(1.0);
231
232    for _ in 0..50 {
233        let lam_mid = 0.5 * (lam_lo + lam_hi);
234        let mut b_reg = b_vec.clone();
235        for i in 0..n {
236            b_reg[i * n + i] += lam_mid;
237        }
238        if let Some(d) = solve_linear_system(&b_reg, g, n) {
239            let d_neg: Vec<f64> = d.iter().map(|di| -di).collect();
240            let dnorm = d_neg.iter().map(|di| di * di).sum::<f64>().sqrt();
241            if dnorm <= delta {
242                lam_hi = lam_mid;
243            } else {
244                lam_lo = lam_mid;
245            }
246            if (lam_hi - lam_lo).abs() < 1e-12 * (1.0 + lam_mid) {
247                return d_neg;
248            }
249        } else {
250            lam_lo = lam_mid;
251        }
252    }
253
254    // Fall back to steepest descent scaled to trust radius
255    let g_norm = g.iter().map(|gi| gi * gi).sum::<f64>().sqrt();
256    if g_norm < 1e-14 {
257        return vec![0.0; n];
258    }
259    g.iter().map(|gi| -gi * delta / g_norm).collect()
260}
261
/// Solve the dense n×n linear system A x = b.
///
/// Thin wrapper over `gaussian_solve` so trust-region call sites read as
/// "solve a linear system" rather than naming the algorithm. Returns `None`
/// when the system is numerically singular.
fn solve_linear_system(a: &[f64], b: &[f64], n: usize) -> Option<Vec<f64>> {
    gaussian_solve(a, b, n)
}
266
267// ─── SR1 optimizer ───────────────────────────────────────────────────────────
268
/// SR1 optimizer with limited-memory Hessian and trust-region globalization.
///
/// Holds only the configuration; all mutable state (curvature history,
/// trust-region radius) is local to `minimize`, so one optimizer instance
/// can be reused across independent problems.
pub struct Sr1Optimizer {
    /// Algorithm configuration.
    pub config: Sr1Config,
}
274
impl Sr1Optimizer {
    /// Create with given configuration.
    pub fn new(config: Sr1Config) -> Self {
        Self { config }
    }

    /// Create with default configuration.
    pub fn default_config() -> Self {
        Self {
            config: Sr1Config::default(),
        }
    }

    /// Minimize `f_and_g` (unconstrained) using the SR1 method.
    ///
    /// Uses a limited-memory Hessian approximation and a trust-region
    /// framework for globalization: a candidate step is accepted when the
    /// ratio ρ of actual to predicted reduction exceeds `cfg.eta`, and the
    /// radius is shrunk/grown based on ρ.
    ///
    /// # Arguments
    /// * `f_and_g` — closure returning (f(x), ∇f(x))
    /// * `x0` — starting point
    ///
    /// # Errors
    /// The visible implementation always returns `Ok`; the `Result`
    /// signature leaves room for future validation errors.
    pub fn minimize<F>(&self, f_and_g: &F, x0: &[f64]) -> Result<OptResult, OptimizeError>
    where
        F: Fn(&[f64]) -> (f64, Vec<f64>),
    {
        let n = x0.len();
        let cfg = &self.config;
        let m = cfg.m; // limited-memory history length

        let mut x = x0.to_vec();
        let (mut f_val, mut g) = f_and_g(&x);

        // Limited-memory storage for curvature pairs (s_k, y_k).
        let mut s_hist: Vec<Vec<f64>> = Vec::with_capacity(m);
        let mut y_hist: Vec<Vec<f64>> = Vec::with_capacity(m);

        // Hessian scaling parameter (H₀ = gamma·I in lsr1_hv_product)
        let mut gamma = 1.0_f64;

        // Trust-region radius
        let mut delta = cfg.delta_init;

        let mut n_iter = 0usize;
        let mut converged = false;

        for iter in 0..cfg.max_iter {
            n_iter = iter;
            let g_norm = g.iter().map(|gi| gi * gi).sum::<f64>().sqrt();
            if g_norm < cfg.tol {
                converged = true;
                break;
            }

            // Compute descent direction using limited-memory SR1 inverse Hessian
            let hg = lsr1_hv_product(&g, &s_hist, &y_hist, gamma);

            // Clip the quasi-Newton step to the trust-region radius
            let hg_norm = hg.iter().map(|v| v * v).sum::<f64>().sqrt();
            let d: Vec<f64> = if hg_norm > delta {
                hg.iter().map(|v| -v * delta / hg_norm).collect()
            } else {
                hg.iter().map(|v| -v).collect()
            };

            // Check if proposed step is a descent direction (SR1 Hessians
            // may be indefinite, so this can fail).
            let slope: f64 = g.iter().zip(d.iter()).map(|(gi, di)| gi * di).sum();
            let d = if slope >= 0.0 {
                // Fallback to steepest descent scaled to the radius
                let gn = g_norm.max(1e-14);
                let sc = delta / gn;
                g.iter().map(|gi| -gi * sc).collect::<Vec<f64>>()
            } else {
                d
            };

            let x_new: Vec<f64> = x.iter().zip(d.iter()).map(|(xi, di)| xi + di).collect();
            let (f_new, g_new) = f_and_g(&x_new);

            // Actual reduction
            let actual_red = f_val - f_new;
            // Predicted reduction: -g^T d - 0.5 d^T B d ≈ -g^T d (simplified)
            let gd: f64 = g.iter().zip(d.iter()).map(|(gi, di)| gi * di).sum();
            let predicted_red = -gd; // simplified (ignores second-order term)

            let rho = if predicted_red.abs() < 1e-14 {
                0.0
            } else {
                actual_red / predicted_red
            };

            // Accept or reject step
            if rho > cfg.eta {
                // Update curvature pair
                let s: Vec<f64> = d.clone();
                let y: Vec<f64> = (0..n).map(|i| g_new[i] - g[i]).collect();

                // SR1 skip condition.
                // NOTE(review): calling lsr1_hv_product with 1/gamma only
                // approximates B·s — the stored pairs parameterize the
                // INVERSE-Hessian model, not B itself. Confirm the skip
                // tolerance is meant to apply to this surrogate residual.
                let bs = lsr1_hv_product(&s, &s_hist, &y_hist, 1.0 / gamma);
                let r: Vec<f64> = (0..n).map(|i| y[i] - bs[i]).collect();
                let rs: f64 = r.iter().zip(s.iter()).map(|(ri, si)| ri * si).sum();
                let r_norm = r.iter().map(|ri| ri * ri).sum::<f64>().sqrt();
                let s_norm = s.iter().map(|si| si * si).sum::<f64>().sqrt();

                if rs.abs() >= cfg.skip_tol * s_norm * r_norm {
                    let sy: f64 = s.iter().zip(y.iter()).map(|(si, yi)| si * yi).sum();
                    let yy: f64 = y.iter().map(|yi| yi * yi).sum::<f64>();
                    if yy > 1e-14 {
                        gamma = sy / yy; // Barzilai–Borwein-style rescaling of H₀
                    }
                    // Drop the oldest pair when the history buffer is full.
                    if s_hist.len() == m {
                        s_hist.remove(0);
                        y_hist.remove(0);
                    }
                    s_hist.push(s);
                    y_hist.push(y);
                }

                x = x_new;
                f_val = f_new;
                g = g_new;
            }

            // Update trust-region radius (ρ is also used after a rejection:
            // a poor ratio shrinks δ so the next attempt is shorter).
            if rho < 0.25 {
                delta *= 0.25;
            } else if rho > 0.75
                && (d.iter().map(|di| di * di).sum::<f64>().sqrt() - delta).abs() < 1e-10
            {
                // Only expand when the step actually hit the boundary.
                delta = (2.0 * delta).min(cfg.delta_max);
            }

            if delta < 1e-12 {
                break; // trust region too small to make progress
            }
        }

        let grad_norm = g.iter().map(|gi| gi * gi).sum::<f64>().sqrt();
        Ok(OptResult {
            x,
            f_val,
            grad_norm,
            n_iter,
            converged,
        })
    }
}
421
#[cfg(test)]
mod tests {
    use super::*;
    use crate::second_order::types::Sr1Config;

    /// Separable convex quadratic f(x) = Σᵢ ½(i+1)xᵢ² with ∇f(x)ᵢ = (i+1)xᵢ;
    /// unique minimiser at the origin.
    fn quadratic(x: &[f64]) -> (f64, Vec<f64>) {
        let f: f64 = x
            .iter()
            .enumerate()
            .map(|(i, xi)| 0.5 * (i as f64 + 1.0) * xi * xi)
            .sum();
        let g: Vec<f64> = x
            .iter()
            .enumerate()
            .map(|(i, xi)| (i as f64 + 1.0) * xi)
            .collect();
        (f, g)
    }

    #[test]
    fn test_sr1_update_formula() {
        let n = 3;
        let mut b = vec![0.0_f64; n * n];
        // Initialize to identity
        for i in 0..n {
            b[i * n + i] = 1.0;
        }
        let s = vec![1.0, 0.0, 0.0];
        let y = vec![2.0, 0.0, 0.0]; // y - Bs = y - s = [1, 0, 0]
        let updated = sr1_update_dense(&mut b, &s, &y, n, 1e-8);
        assert!(updated, "SR1 update should proceed");
        // After update: B_new[0,0] = 1 + (r·r)/(rᵀs) = 1 + 1/1 = 2.0
        assert!(
            (b[0] - 2.0).abs() < 1e-10,
            "B[0,0] should be 2, got {}",
            b[0]
        );
    }

    #[test]
    fn test_sr1_skip_bad_curvature() {
        let n = 2;
        // y = B s → y - B s = 0 → zero denominator → update must be skipped.
        let mut b = vec![1.0, 0.0, 0.0, 1.0]; // identity
        let s = vec![1.0, 0.0];
        let y = vec![1.0, 0.0]; // Bs = s, so r = 0
        let updated = sr1_update_dense(&mut b, &s, &y, n, 1e-8);
        assert!(!updated, "SR1 update should be skipped (zero denominator)");
    }

    #[test]
    fn test_sr1_trust_region() {
        let n = 2;
        let b = vec![2.0, 0.0, 0.0, 3.0]; // positive definite
        let g = vec![1.0, 1.0];
        // The Newton step (-1/2, -1/3) has norm > 0.5, so the solver must
        // regularize and return a step on (or inside) the boundary.
        let delta = 0.5_f64;
        let d = trust_region_step(&b, &g, delta, n);
        let d_norm = d.iter().map(|di| di * di).sum::<f64>().sqrt();
        assert!(
            d_norm <= delta + 1e-9,
            "Trust region violated: ‖d‖={} > δ={}",
            d_norm,
            delta
        );
    }

    #[test]
    fn test_sr1_quadratic() {
        // End-to-end: the optimizer should drive the convex quadratic to ≈0.
        let opt = Sr1Optimizer::default_config();
        let x0 = vec![3.0, -2.0, 1.0];
        let result = opt.minimize(&quadratic, &x0).expect("SR1 minimize failed");
        for xi in &result.x {
            assert!(xi.abs() < 0.01, "Expected x≈0, got {}", xi);
        }
    }

    #[test]
    fn test_sr1_positive_definite_approx() {
        // After several SR1 updates on a convex problem, check that the
        // Hessian approximation is at least "positive in the s direction"
        let n = 2;
        let mut b = vec![1.0, 0.0, 0.0, 1.0];
        let pairs = vec![
            (vec![1.0, 0.0], vec![2.0, 0.0]),
            (vec![0.0, 1.0], vec![0.0, 3.0]),
        ];
        for (s, y) in &pairs {
            sr1_update_dense(&mut b, s, y, n, 1e-8);
        }
        // B s_i should have positive inner product with s_i
        for (s, _y) in &pairs {
            let bs = mat_vec(&b, s, n);
            let sts: f64 = s.iter().zip(bs.iter()).map(|(si, bsi)| si * bsi).sum();
            assert!(sts > 0.0, "B should be positive in s direction");
        }
    }

    #[test]
    fn test_sr1_symmetric_update() {
        // The SR1 correction is r·rᵀ/(rᵀs), so symmetry of B must be preserved.
        let n = 3;
        let mut b = vec![2.0, 1.0, 0.0, 1.0, 3.0, 0.5, 0.0, 0.5, 4.0];
        let s = vec![0.5, -0.3, 0.1];
        let y = vec![1.5, 0.2, 0.4];
        sr1_update_dense(&mut b, &s, &y, n, 1e-8);
        // Check symmetry: B[i,j] == B[j,i]
        for i in 0..n {
            for j in 0..n {
                assert!(
                    (b[i * n + j] - b[j * n + i]).abs() < 1e-10,
                    "B not symmetric at ({},{}) vs ({},{}) : {} vs {}",
                    i,
                    j,
                    j,
                    i,
                    b[i * n + j],
                    b[j * n + i]
                );
            }
        }
    }
}