aprender/optim/
mod.rs

1//! Optimization algorithms for gradient-based learning.
2//!
3//! This module provides both stochastic (mini-batch) and batch (deterministic) optimizers
4//! following the unified [`Optimizer`] trait architecture.
5//!
6//! # Available Optimizers
7//!
8//! ## Stochastic (Mini-Batch) Optimizers
9//! - [`SGD`] - Stochastic Gradient Descent with optional momentum
10//! - [`Adam`] - Adaptive Moment Estimation (adaptive learning rates)
11//!
12//! ## Batch (Deterministic) Optimizers
13//! - [`LBFGS`] - Limited-memory BFGS (memory-efficient quasi-Newton)
14//! - [`ConjugateGradient`] - Conjugate Gradient with three beta formulas
15//! - [`DampedNewton`] - Newton's method with automatic damping for stability
16//!
17//! ## Convex Optimization (Phase 2)
18//! - [`FISTA`] - Fast Iterative Shrinkage-Thresholding (proximal gradient)
19//! - [`CoordinateDescent`] - Coordinate-wise optimization for high dimensions
20//! - [`ADMM`] - Alternating Direction Method of Multipliers (distributed ML)
21//!
22//! ## Constrained Optimization (Phase 3)
23//! - [`ProjectedGradientDescent`] - Projection onto convex sets
24//! - [`AugmentedLagrangian`] - Equality-constrained optimization
25//! - [`InteriorPoint`] - Inequality-constrained optimization (log-barrier)
26//!
27//! ## Utility Functions
28//! - [`safe_cholesky_solve`] - Cholesky solver with automatic regularization
29//!
30//! # Stochastic Optimization (Mini-Batch)
31//!
32//! Stochastic optimizers update parameters incrementally using mini-batch gradients.
33//! Use the [`Optimizer::step`] method for parameter updates:
34//!
35//! ```
36//! use aprender::optim::SGD;
37//! use aprender::primitives::Vector;
38//!
39//! // Create optimizer with learning rate 0.01
40//! let mut optimizer = SGD::new(0.01);
41//!
42//! // Initialize parameters and gradients
43//! let mut params = Vector::from_slice(&[1.0, 2.0, 3.0]);
44//! let gradients = Vector::from_slice(&[0.1, 0.2, 0.3]);
45//!
46//! // Update parameters
47//! optimizer.step(&mut params, &gradients);
48//!
49//! // Parameters are updated: params = params - lr * gradients
50//! assert!((params[0] - 0.999).abs() < 1e-6);
51//! ```
52//!
53//! # Batch Optimization (Full Dataset)
54//!
55//! Batch optimizers minimize objective functions using full dataset access.
56//! They use the `minimize` method which returns detailed convergence information:
57//!
58//! ```
59//! use aprender::optim::{LBFGS, ConvergenceStatus, Optimizer};
60//! use aprender::primitives::Vector;
61//!
62//! // Create L-BFGS optimizer: 100 max iterations, 1e-5 tolerance, 10 memory size
63//! let mut optimizer = LBFGS::new(100, 1e-5, 10);
64//!
65//! // Define objective and gradient functions
66//! let objective = |x: &Vector<f32>| (x[0] - 5.0).powi(2) + (x[1] - 3.0).powi(2);
67//! let gradient = |x: &Vector<f32>| {
68//!     Vector::from_slice(&[2.0 * (x[0] - 5.0), 2.0 * (x[1] - 3.0)])
69//! };
70//!
71//! let x0 = Vector::from_slice(&[0.0, 0.0]);
72//! let result = optimizer.minimize(objective, gradient, x0);
73//!
74//! assert_eq!(result.status, ConvergenceStatus::Converged);
75//! assert!((result.solution[0] - 5.0).abs() < 1e-4);
76//! assert!((result.solution[1] - 3.0).abs() < 1e-4);
77//! ```
78//!
79//! # See Also
80//!
81//! - [`examples/batch_optimization.rs`](https://github.com/paiml/aprender/tree/main/examples/batch_optimization.rs) - Comprehensive examples
82//! - Specification: `docs/specifications/comprehensive-optimization-spec.md`
83
84use serde::{Deserialize, Serialize};
85
86use crate::primitives::{Matrix, Vector};
87
/// Result of an optimization procedure.
///
/// Contains the final solution, convergence information, and diagnostic metrics.
/// Returned by batch optimizers' `minimize` methods.
#[derive(Debug, Clone)]
pub struct OptimizationResult {
    /// Final solution (optimized parameters)
    pub solution: Vector<f32>,
    /// Final objective function value f(solution)
    pub objective_value: f32,
    /// Number of iterations performed before termination
    pub iterations: usize,
    /// Convergence status (why the optimizer stopped)
    pub status: ConvergenceStatus,
    /// Final gradient norm (‖∇f(x)‖)
    pub gradient_norm: f32,
    /// Constraint violation (0.0 for unconstrained problems)
    pub constraint_violation: f32,
    /// Total elapsed wall-clock time of the optimization run
    pub elapsed_time: std::time::Duration,
}
108
109impl OptimizationResult {
110    /// Creates a converged result.
111    #[must_use]
112    pub fn converged(solution: Vector<f32>, iterations: usize) -> Self {
113        Self {
114            solution,
115            objective_value: 0.0,
116            iterations,
117            status: ConvergenceStatus::Converged,
118            gradient_norm: 0.0,
119            constraint_violation: 0.0,
120            elapsed_time: std::time::Duration::ZERO,
121        }
122    }
123
124    /// Creates a max-iterations result.
125    #[must_use]
126    pub fn max_iterations(solution: Vector<f32>) -> Self {
127        Self {
128            solution,
129            objective_value: 0.0,
130            iterations: 0,
131            status: ConvergenceStatus::MaxIterations,
132            gradient_norm: 0.0,
133            constraint_violation: 0.0,
134            elapsed_time: std::time::Duration::ZERO,
135        }
136    }
137}
138
/// Convergence status of an optimization procedure.
///
/// Reported via [`OptimizationResult`] to explain why the optimizer stopped.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ConvergenceStatus {
    /// Converged (gradient norm < tolerance)
    Converged,
    /// Reached maximum iteration limit without satisfying the tolerance
    MaxIterations,
    /// Progress stalled (step size too small to make headway)
    Stalled,
    /// Numerical error (NaN, Inf, etc.) encountered during iteration
    NumericalError,
    /// Optimization still running
    Running,
    /// User-requested termination
    UserTerminated,
}
155
156/// Safely solves a linear system Ax = b using Cholesky decomposition with automatic regularization.
157///
158/// When the matrix is not positive definite or near-singular, this function automatically
159/// adds regularization (λI) to make the system solvable. This is essential for numerical
160/// stability in second-order optimization methods like Newton's method.
161///
162/// # Algorithm
163///
164/// 1. Try standard Cholesky solve with A
165/// 2. If it fails, add regularization: (A + λI)
166/// 3. Increase λ progressively (×10) until solve succeeds or max attempts reached
167///
168/// # Arguments
169///
170/// * `A` - Symmetric matrix (should be positive definite, but may not be)
171/// * `b` - Right-hand side vector
172/// * `initial_lambda` - Initial regularization parameter (typical: 1e-8)
173/// * `max_attempts` - Maximum regularization attempts (typical: 10)
174///
175/// # Returns
176///
177/// * `Ok(x)` - Solution vector if successful
178/// * `Err(msg)` - Error message if regularization failed
179///
180/// # Example
181///
182/// ```
183/// use aprender::optim::safe_cholesky_solve;
184/// use aprender::primitives::{Matrix, Vector};
185///
186/// // Slightly ill-conditioned matrix
187/// let A = Matrix::from_vec(2, 2, vec![1.0, 0.0, 0.0, 1e-8]).expect("valid dimensions");
188/// let b = Vector::from_slice(&[1.0, 1.0]);
189///
190/// let x = safe_cholesky_solve(&A, &b, 1e-8, 10).expect("should solve with regularization");
191/// assert_eq!(x.len(), 2);
192/// ```
193///
194/// # Use Cases
195///
196/// - **Damped Newton**: Regularize Hessian when not positive definite
197/// - **Levenberg-Marquardt**: Add damping parameter for stability
198/// - **Trust region**: Solve trust region subproblem with ill-conditioned Hessian
199///
200/// # References
201///
202/// - Nocedal & Wright (2006), *Numerical Optimization*, Chapter 3
203pub fn safe_cholesky_solve(
204    a: &Matrix<f32>,
205    b: &Vector<f32>,
206    initial_lambda: f32,
207    max_attempts: usize,
208) -> Result<Vector<f32>, &'static str> {
209    // First try without regularization
210    if let Ok(x) = a.cholesky_solve(b) {
211        return Ok(x);
212    }
213
214    // Matrix not positive definite, try with regularization
215    solve_with_regularization(a, b, initial_lambda, max_attempts)
216}
217
218/// Solve linear system with Tikhonov regularization (A + λI)x = b.
219fn solve_with_regularization(
220    a: &Matrix<f32>,
221    b: &Vector<f32>,
222    initial_lambda: f32,
223    max_attempts: usize,
224) -> Result<Vector<f32>, &'static str> {
225    let n = a.n_rows();
226    let mut lambda = initial_lambda;
227
228    for _attempt in 0..max_attempts {
229        let a_reg = create_regularized_matrix(a, n, lambda);
230
231        if let Ok(x) = a_reg.cholesky_solve(b) {
232            return Ok(x);
233        }
234
235        lambda *= 10.0;
236        if lambda > 1e6 {
237            return Err(
238                "Cholesky solve failed: matrix too ill-conditioned even with regularization",
239            );
240        }
241    }
242
243    Err("Cholesky solve failed after maximum regularization attempts")
244}
245
246/// Create regularized matrix `A_reg` = A + λI.
247fn create_regularized_matrix(a: &Matrix<f32>, n: usize, lambda: f32) -> Matrix<f32> {
248    let mut a_reg_data = vec![0.0; n * n];
249    for i in 0..n {
250        for j in 0..n {
251            let diagonal_term = if i == j { lambda } else { 0.0 };
252            a_reg_data[i * n + j] = a.get(i, j) + diagonal_term;
253        }
254    }
255    Matrix::from_vec(n, n, a_reg_data).expect("Matrix dimensions should be valid")
256}
257
/// Line search strategy for determining step size in batch optimization.
///
/// Line search methods find an appropriate step size α along a search direction d
/// by solving the 1D optimization problem:
///
/// ```text
/// minimize f(x + α*d) over α > 0
/// ```
///
/// Different strategies enforce different conditions on the step size.
pub trait LineSearch {
    /// Finds a suitable step size along the search direction.
    ///
    /// # Arguments
    ///
    /// * `f` - Objective function f: ℝⁿ → ℝ
    /// * `grad` - Gradient function ∇f: ℝⁿ → ℝⁿ
    /// * `x` - Current point
    /// * `d` - Search direction (typically descent direction, ∇f(x)·d < 0)
    ///
    /// # Returns
    ///
    /// Step size α > 0 satisfying the line search conditions.
    /// Implementations in this module return their last trial α when the
    /// iteration budget is exhausted, so the conditions are not guaranteed
    /// to hold in that case.
    fn search<F, G>(&self, f: &F, grad: &G, x: &Vector<f32>, d: &Vector<f32>) -> f32
    where
        F: Fn(&Vector<f32>) -> f32,
        G: Fn(&Vector<f32>) -> Vector<f32>;
}
286
/// Backtracking line search with Armijo condition.
///
/// Starts with step size α = 1 and repeatedly shrinks it by factor ρ until
/// the Armijo condition is satisfied:
///
/// ```text
/// f(x + α*d) ≤ f(x) + c₁*α*∇f(x)ᵀd
/// ```
///
/// This ensures sufficient decrease in the objective function.
///
/// # Parameters
///
/// - **c1**: Armijo constant (typical: 1e-4), controls acceptable decrease
/// - **rho**: Backtracking factor (typical: 0.5), shrinkage rate for α
/// - **`max_iter`**: Maximum backtracking iterations (safety limit)
///
/// # Example
///
/// ```
/// use aprender::optim::{BacktrackingLineSearch, LineSearch};
/// use aprender::primitives::Vector;
///
/// let line_search = BacktrackingLineSearch::new(1e-4, 0.5, 50);
///
/// // Define a simple quadratic function
/// let f = |x: &Vector<f32>| x[0] * x[0] + x[1] * x[1];
/// let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 2.0 * x[1]]);
///
/// let x = Vector::from_slice(&[1.0, 1.0]);
/// let d = Vector::from_slice(&[-2.0, -2.0]); // Descent direction
///
/// let alpha = line_search.search(&f, &grad, &x, &d);
/// assert!(alpha > 0.0);
/// ```
#[derive(Debug, Clone)]
pub struct BacktrackingLineSearch {
    /// Armijo constant (c₁ ∈ (0, 1), typical: 1e-4)
    c1: f32,
    /// Backtracking factor (ρ ∈ (0, 1), typical: 0.5) applied each rejection
    rho: f32,
    /// Maximum backtracking iterations (safety cap)
    max_iter: usize,
}
331
332impl BacktrackingLineSearch {
333    /// Creates a new backtracking line search.
334    ///
335    /// # Arguments
336    ///
337    /// * `c1` - Armijo constant (typical: 1e-4)
338    /// * `rho` - Backtracking factor (typical: 0.5)
339    /// * `max_iter` - Maximum iterations (typical: 50)
340    ///
341    /// # Example
342    ///
343    /// ```
344    /// use aprender::optim::BacktrackingLineSearch;
345    ///
346    /// let line_search = BacktrackingLineSearch::new(1e-4, 0.5, 50);
347    /// ```
348    #[must_use]
349    pub fn new(c1: f32, rho: f32, max_iter: usize) -> Self {
350        Self { c1, rho, max_iter }
351    }
352}
353
354impl Default for BacktrackingLineSearch {
355    /// Creates a backtracking line search with default parameters.
356    ///
357    /// Defaults: c1=1e-4, rho=0.5, `max_iter=50`
358    fn default() -> Self {
359        Self::new(1e-4, 0.5, 50)
360    }
361}
362
363impl LineSearch for BacktrackingLineSearch {
364    fn search<F, G>(&self, f: &F, grad: &G, x: &Vector<f32>, d: &Vector<f32>) -> f32
365    where
366        F: Fn(&Vector<f32>) -> f32,
367        G: Fn(&Vector<f32>) -> Vector<f32>,
368    {
369        let mut alpha = 1.0;
370        let fx = f(x);
371        let grad_x = grad(x);
372
373        // Compute directional derivative: ∇f(x)ᵀd
374        let mut dir_deriv = 0.0;
375        for i in 0..x.len() {
376            dir_deriv += grad_x[i] * d[i];
377        }
378
379        // Backtracking loop
380        for _ in 0..self.max_iter {
381            // Compute x_new = x + alpha * d
382            let mut x_new = Vector::zeros(x.len());
383            for i in 0..x.len() {
384                x_new[i] = x[i] + alpha * d[i];
385            }
386
387            let fx_new = f(&x_new);
388
389            // Check Armijo condition: f(x + α*d) ≤ f(x) + c₁*α*∇f(x)ᵀd
390            if fx_new <= fx + self.c1 * alpha * dir_deriv {
391                return alpha;
392            }
393
394            // Shrink step size
395            alpha *= self.rho;
396        }
397
398        // Return the last alpha if max iterations reached
399        alpha
400    }
401}
402
/// Wolfe line search with Armijo and curvature conditions.
///
/// Enforces both the Armijo condition (sufficient decrease) and the curvature
/// condition (sufficient curvature):
///
/// ```text
/// Armijo:    f(x + α*d) ≤ f(x) + c₁*α*∇f(x)ᵀd
/// Curvature: |∇f(x + α*d)ᵀd| ≤ c₂*|∇f(x)ᵀd|
/// ```
///
/// The curvature condition ensures the step size is not too small by requiring
/// that the gradient has decreased sufficiently along the search direction.
///
/// # Parameters
///
/// - **c1**: Armijo constant (typical: 1e-4), c₁ ∈ (0, c₂)
/// - **c2**: Curvature constant (typical: 0.9), c₂ ∈ (c₁, 1)
/// - **`max_iter`**: Maximum line search iterations
///
/// # Example
///
/// ```
/// use aprender::optim::{WolfeLineSearch, LineSearch};
/// use aprender::primitives::Vector;
///
/// let line_search = WolfeLineSearch::new(1e-4, 0.9, 50);
///
/// let f = |x: &Vector<f32>| x[0] * x[0];
/// let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0]]);
///
/// let x = Vector::from_slice(&[1.0]);
/// let d = Vector::from_slice(&[-2.0]);
///
/// let alpha = line_search.search(&f, &grad, &x, &d);
/// assert!(alpha > 0.0);
/// ```
#[derive(Debug, Clone)]
pub struct WolfeLineSearch {
    /// Armijo constant (c₁ ∈ (0, c₂), typical: 1e-4)
    c1: f32,
    /// Curvature constant (c₂ ∈ (c₁, 1), typical: 0.9)
    c2: f32,
    /// Maximum line search iterations (safety cap on the bisection loop)
    max_iter: usize,
}
448
449impl WolfeLineSearch {
450    /// Creates a new Wolfe line search.
451    ///
452    /// # Arguments
453    ///
454    /// * `c1` - Armijo constant (typical: 1e-4)
455    /// * `c2` - Curvature constant (typical: 0.9)
456    /// * `max_iter` - Maximum iterations (typical: 50)
457    ///
458    /// # Panics
459    ///
460    /// Panics if c1 >= c2 or values are outside (0, 1).
461    ///
462    /// # Example
463    ///
464    /// ```
465    /// use aprender::optim::WolfeLineSearch;
466    ///
467    /// let line_search = WolfeLineSearch::new(1e-4, 0.9, 50);
468    /// ```
469    #[must_use]
470    pub fn new(c1: f32, c2: f32, max_iter: usize) -> Self {
471        assert!(
472            c1 < c2 && c1 > 0.0 && c2 < 1.0,
473            "Wolfe conditions require 0 < c1 < c2 < 1"
474        );
475        Self { c1, c2, max_iter }
476    }
477}
478
479impl Default for WolfeLineSearch {
480    /// Creates a Wolfe line search with default parameters.
481    ///
482    /// Defaults: c1=1e-4, c2=0.9, `max_iter=50`
483    fn default() -> Self {
484        Self::new(1e-4, 0.9, 50)
485    }
486}
487
impl LineSearch for WolfeLineSearch {
    /// Bracketing/bisection search for a step satisfying both the Armijo
    /// (sufficient-decrease) and curvature conditions.
    ///
    /// Maintains a bracket [`alpha_lo`, `alpha_hi`] known to contain an
    /// acceptable step: failures shrink the bracket by bisection, while an
    /// unbounded upper end (`alpha_hi = ∞`) is handled by doubling α.
    fn search<F, G>(&self, f: &F, grad: &G, x: &Vector<f32>, d: &Vector<f32>) -> f32
    where
        F: Fn(&Vector<f32>) -> f32,
        G: Fn(&Vector<f32>) -> Vector<f32>,
    {
        let fx = f(x);
        let grad_x = grad(x);

        // Compute directional derivative at the starting point: ∇f(x)ᵀd
        let mut dir_deriv = 0.0;
        for i in 0..x.len() {
            dir_deriv += grad_x[i] * d[i];
        }

        // Start with alpha = 1.0; the bracket is initially (0, ∞).
        let mut alpha = 1.0;
        let mut alpha_lo = 0.0;
        let mut alpha_hi = f32::INFINITY;

        for _ in 0..self.max_iter {
            // Compute trial point x_new = x + alpha * d
            let mut x_new = Vector::zeros(x.len());
            for i in 0..x.len() {
                x_new[i] = x[i] + alpha * d[i];
            }

            let fx_new = f(&x_new);
            let grad_new = grad(&x_new);

            // Directional derivative at the trial point: ∇f(x + α*d)ᵀd
            let mut dir_deriv_new = 0.0;
            for i in 0..x.len() {
                dir_deriv_new += grad_new[i] * d[i];
            }

            // Check Armijo condition: f(x + α*d) ≤ f(x) + c₁*α*∇f(x)ᵀd
            if fx_new > fx + self.c1 * alpha * dir_deriv {
                // Armijo fails - alpha too large; shrink the upper bound
                // and bisect toward the lower bound.
                alpha_hi = alpha;
                alpha = (alpha_lo + alpha_hi) / 2.0;
                continue;
            }

            // Check curvature condition: |∇f(x + α*d)ᵀd| ≤ c₂*|∇f(x)ᵀd|
            if dir_deriv_new.abs() <= self.c2 * dir_deriv.abs() {
                // Both strong-Wolfe conditions satisfied
                return alpha;
            }

            // Curvature condition fails: adjust the bracket based on the
            // sign of the new directional derivative.
            if dir_deriv_new > 0.0 {
                // Gradient sign changed - we stepped past the minimizer, reduce alpha
                alpha_hi = alpha;
            } else {
                // Gradient still negative - step is too timid, increase alpha
                alpha_lo = alpha;
            }

            // Update alpha: bisect a finite bracket, otherwise expand by doubling.
            if alpha_hi.is_finite() {
                alpha = (alpha_lo + alpha_hi) / 2.0;
            } else {
                alpha *= 2.0;
            }
        }

        // Return the last alpha if max iterations reached (conditions may not hold).
        alpha
    }
}
559
/// Limited-memory BFGS (L-BFGS) optimizer.
///
/// L-BFGS is a quasi-Newton method that approximates the inverse Hessian using
/// a limited history of gradient information. It's efficient for large-scale
/// optimization problems where storing the full Hessian is infeasible.
///
/// # Algorithm
///
/// 1. Compute gradient `g_k` = ∇`f(x_k)`
/// 2. Compute search direction `d_k` using two-loop recursion (approximates -H^(-1) * `g_k`)
/// 3. Find step size `α_k` via line search (Wolfe conditions)
/// 4. Update: x_{k+1} = `x_k` + `α_k` * `d_k` (the minus sign is already folded into `d_k`)
/// 5. Store gradient and position differences for next iteration
///
/// # Parameters
///
/// - **`max_iter`**: Maximum number of iterations
/// - **tol**: Convergence tolerance (gradient norm)
/// - **m**: History size (typically 5-20, tradeoff between memory and convergence)
///
/// # Example
///
/// ```
/// use aprender::optim::{LBFGS, Optimizer};
/// use aprender::primitives::Vector;
///
/// let mut optimizer = LBFGS::new(100, 1e-5, 10);
///
/// // Define Rosenbrock function and its gradient
/// let f = |x: &Vector<f32>| {
///     let a = x[0];
///     let b = x[1];
///     (1.0 - a).powi(2) + 100.0 * (b - a * a).powi(2)
/// };
///
/// let grad = |x: &Vector<f32>| {
///     let a = x[0];
///     let b = x[1];
///     Vector::from_slice(&[
///         -2.0 * (1.0 - a) - 400.0 * a * (b - a * a),
///         200.0 * (b - a * a),
///     ])
/// };
///
/// let x0 = Vector::from_slice(&[0.0, 0.0]);
/// let result = optimizer.minimize(f, grad, x0);
///
/// // Should converge to (1, 1)
/// assert_eq!(result.status, aprender::optim::ConvergenceStatus::Converged);
/// ```
#[derive(Debug, Clone)]
pub struct LBFGS {
    /// Maximum number of iterations
    max_iter: usize,
    /// Convergence tolerance (gradient norm)
    tol: f32,
    /// History size (number of correction pairs to store)
    m: usize,
    /// Line search strategy (strong Wolfe conditions)
    line_search: WolfeLineSearch,
    /// Position differences: `s_k` = x_{k+1} - `x_k` (at most `m` entries)
    s_history: Vec<Vector<f32>>,
    /// Gradient differences: `y_k` = g_{k+1} - `g_k` (paired with `s_history`)
    y_history: Vec<Vector<f32>>,
}
625
impl LBFGS {
    /// Creates a new L-BFGS optimizer.
    ///
    /// # Arguments
    ///
    /// * `max_iter` - Maximum number of iterations (typical: 100-1000)
    /// * `tol` - Convergence tolerance for gradient norm (typical: 1e-5)
    /// * `m` - History size (typical: 5-20)
    ///
    /// # Example
    ///
    /// ```
    /// use aprender::optim::LBFGS;
    ///
    /// let optimizer = LBFGS::new(100, 1e-5, 10);
    /// ```
    #[must_use]
    pub fn new(max_iter: usize, tol: f32, m: usize) -> Self {
        Self {
            max_iter,
            tol,
            m,
            // Standard strong-Wolfe constants for quasi-Newton methods.
            line_search: WolfeLineSearch::new(1e-4, 0.9, 50),
            s_history: Vec::with_capacity(m),
            y_history: Vec::with_capacity(m),
        }
    }

    /// Two-loop recursion to compute search direction.
    ///
    /// Approximates -H^(-1) * grad where H is the Hessian (note the sign:
    /// `q` is initialized to -grad, so the result is already a descent
    /// direction). Uses stored history of s (position diff) and y
    /// (gradient diff).
    fn compute_direction(&self, grad: &Vector<f32>) -> Vector<f32> {
        let n = grad.len();
        let k = self.s_history.len();

        if k == 0 {
            // No history: use steepest descent
            let mut d = Vector::zeros(n);
            for i in 0..n {
                d[i] = -grad[i];
            }
            return d;
        }

        // Initialize q = -grad
        let mut q = Vector::zeros(n);
        for i in 0..n {
            q[i] = -grad[i];
        }

        let mut alpha = vec![0.0; k];
        let mut rho = vec![0.0; k];

        // First loop: backward pass (newest to oldest correction pair)
        for i in (0..k).rev() {
            let s = &self.s_history[i];
            let y = &self.y_history[i];

            // rho_i = 1 / (y_i^T s_i)
            // Division is well-defined: pairs are only stored in minimize()
            // when y^T s > 1e-10 (curvature condition).
            let mut y_dot_s = 0.0;
            for j in 0..n {
                y_dot_s += y[j] * s[j];
            }
            rho[i] = 1.0 / y_dot_s;

            // alpha_i = rho_i * s_i^T * q
            let mut s_dot_q = 0.0;
            for j in 0..n {
                s_dot_q += s[j] * q[j];
            }
            alpha[i] = rho[i] * s_dot_q;

            // q = q - alpha_i * y_i
            for j in 0..n {
                q[j] -= alpha[i] * y[j];
            }
        }

        // Scale by H_0 = (s^T y) / (y^T y) from most recent update
        // (standard initial inverse-Hessian scaling, Nocedal & Wright eq. 7.20)
        let s_last = &self.s_history[k - 1];
        let y_last = &self.y_history[k - 1];

        let mut s_dot_y = 0.0;
        let mut y_dot_y = 0.0;
        for i in 0..n {
            s_dot_y += s_last[i] * y_last[i];
            y_dot_y += y_last[i] * y_last[i];
        }
        let gamma = s_dot_y / y_dot_y;

        // r = H_0 * q = gamma * q
        let mut r = Vector::zeros(n);
        for i in 0..n {
            r[i] = gamma * q[i];
        }

        // Second loop: forward pass (oldest to newest correction pair)
        for i in 0..k {
            let s = &self.s_history[i];
            let y = &self.y_history[i];

            // beta = rho_i * y_i^T * r
            let mut y_dot_r = 0.0;
            for j in 0..n {
                y_dot_r += y[j] * r[j];
            }
            let beta = rho[i] * y_dot_r;

            // r = r + s_i * (alpha_i - beta)
            for j in 0..n {
                r[j] += s[j] * (alpha[i] - beta);
            }
        }

        r
    }

    /// Computes the L2 (Euclidean) norm of a vector.
    fn norm(v: &Vector<f32>) -> f32 {
        let mut sum = 0.0;
        for i in 0..v.len() {
            sum += v[i] * v[i];
        }
        sum.sqrt()
    }
}
753
impl Optimizer for LBFGS {
    /// Not supported: L-BFGS is a batch method with no incremental update.
    ///
    /// # Panics
    ///
    /// Always panics. Use `minimize` for batch optimization instead.
    fn step(&mut self, _params: &mut Vector<f32>, _gradients: &Vector<f32>) {
        unimplemented!(
            "L-BFGS does not support stochastic updates (step). Use minimize() for batch optimization."
        )
    }

    /// Minimizes `objective` starting from `x0` using the L-BFGS iteration.
    ///
    /// Terminates with `Converged` when the gradient norm drops below `tol`,
    /// `Stalled` when the line search step collapses below 1e-12,
    /// `NumericalError` on NaN/Inf objective values, or `MaxIterations`
    /// when the iteration budget is exhausted.
    fn minimize<F, G>(&mut self, objective: F, gradient: G, x0: Vector<f32>) -> OptimizationResult
    where
        F: Fn(&Vector<f32>) -> f32,
        G: Fn(&Vector<f32>) -> Vector<f32>,
    {
        let start_time = std::time::Instant::now();
        let n = x0.len();

        // Clear history from previous runs so stale curvature pairs
        // cannot contaminate this minimization.
        self.s_history.clear();
        self.y_history.clear();

        let mut x = x0;
        let mut fx = objective(&x);
        let mut grad = gradient(&x);
        let mut grad_norm = Self::norm(&grad);

        for iter in 0..self.max_iter {
            // Check convergence: gradient norm below tolerance
            if grad_norm < self.tol {
                return OptimizationResult {
                    solution: x,
                    objective_value: fx,
                    iterations: iter,
                    status: ConvergenceStatus::Converged,
                    gradient_norm: grad_norm,
                    constraint_violation: 0.0,
                    elapsed_time: start_time.elapsed(),
                };
            }

            // Compute search direction via two-loop recursion (descent direction)
            let d = self.compute_direction(&grad);

            // Line search along d (strong Wolfe conditions)
            let alpha = self.line_search.search(&objective, &gradient, &x, &d);

            // Check for stalled progress: step too small to move
            if alpha < 1e-12 {
                return OptimizationResult {
                    solution: x,
                    objective_value: fx,
                    iterations: iter,
                    status: ConvergenceStatus::Stalled,
                    gradient_norm: grad_norm,
                    constraint_violation: 0.0,
                    elapsed_time: start_time.elapsed(),
                };
            }

            // Update position: x_new = x + alpha * d
            let mut x_new = Vector::zeros(n);
            for i in 0..n {
                x_new[i] = x[i] + alpha * d[i];
            }

            // Compute new objective and gradient
            let fx_new = objective(&x_new);
            let grad_new = gradient(&x_new);

            // Check for numerical errors; return the last good iterate
            if fx_new.is_nan() || fx_new.is_infinite() {
                return OptimizationResult {
                    solution: x,
                    objective_value: fx,
                    iterations: iter,
                    status: ConvergenceStatus::NumericalError,
                    gradient_norm: grad_norm,
                    constraint_violation: 0.0,
                    elapsed_time: start_time.elapsed(),
                };
            }

            // Compute s_k = x_new - x and y_k = grad_new - grad
            let mut s_k = Vector::zeros(n);
            let mut y_k = Vector::zeros(n);
            for i in 0..n {
                s_k[i] = x_new[i] - x[i];
                y_k[i] = grad_new[i] - grad[i];
            }

            // Check curvature condition: y^T s > 0
            // (required for the inverse-Hessian approximation to stay
            // positive definite; compute_direction divides by y^T s)
            let mut y_dot_s = 0.0;
            for i in 0..n {
                y_dot_s += y_k[i] * s_k[i];
            }

            if y_dot_s > 1e-10 {
                // Store in history, evicting the oldest pair when full
                if self.s_history.len() >= self.m {
                    self.s_history.remove(0);
                    self.y_history.remove(0);
                }
                self.s_history.push(s_k);
                self.y_history.push(y_k);
            }

            // Update for next iteration
            x = x_new;
            fx = fx_new;
            grad = grad_new;
            grad_norm = Self::norm(&grad);
        }

        // Max iterations reached without meeting the tolerance
        OptimizationResult {
            solution: x,
            objective_value: fx,
            iterations: self.max_iter,
            status: ConvergenceStatus::MaxIterations,
            gradient_norm: grad_norm,
            constraint_violation: 0.0,
            elapsed_time: start_time.elapsed(),
        }
    }

    /// Resets the optimizer by discarding all stored curvature pairs.
    fn reset(&mut self) {
        self.s_history.clear();
        self.y_history.clear();
    }
}
882
/// Beta computation formula for Conjugate Gradient.
///
/// Different formulas provide different convergence properties and
/// numerical stability characteristics. All are equivalent on exact
/// quadratics but behave differently on general nonlinear problems.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CGBetaFormula {
    /// Fletcher-Reeves: β = (g_{k+1}^T g_{k+1}) / (`g_k^T` `g_k`)
    ///
    /// Most stable but can be slow on non-quadratic problems.
    FletcherReeves,
    /// Polak-Ribière: β = g_{k+1}^T (g_{k+1} - `g_k`) / (`g_k^T` `g_k`)
    ///
    /// Better performance than FR, includes automatic restart (β < 0).
    PolakRibiere,
    /// Hestenes-Stiefel: β = g_{k+1}^T (g_{k+1} - `g_k`) / (`d_k^T` (g_{k+1} - `g_k`))
    ///
    /// Similar to PR but with different denominator, can be more robust.
    HestenesStiefel,
}
902
/// Nonlinear Conjugate Gradient (CG) optimizer.
///
/// CG is an iterative method for solving optimization problems that uses
/// conjugate directions rather than steepest descent. It's particularly
/// effective for quadratic problems but extends to general nonlinear optimization.
///
/// # Algorithm
///
/// 1. Initialize with steepest descent: `d_0` = -∇`f(x_0)`
/// 2. Line search: find `α_k` minimizing `f(x_k` + `α_k` `d_k`)
/// 3. Update: x_{k+1} = `x_k` + `α_k` `d_k`
/// 4. Compute β_{k+1} using chosen formula (FR, PR, or HS)
/// 5. Update direction: d_{k+1} = -∇f(x_{k+1}) + β_{k+1} `d_k`
/// 6. Restart if β < 0 or every n iterations
///
/// # Parameters
///
/// - **`max_iter`**: Maximum number of iterations
/// - **tol**: Convergence tolerance (gradient norm)
/// - **`beta_formula`**: Method for computing β (FR, PR, or HS)
/// - **`restart_interval`**: Restart with steepest descent every n iterations (0 = no periodic restart)
///
/// # Example
///
/// ```
/// use aprender::optim::{ConjugateGradient, CGBetaFormula, Optimizer};
/// use aprender::primitives::Vector;
///
/// let mut optimizer = ConjugateGradient::new(100, 1e-5, CGBetaFormula::PolakRibiere);
///
/// // Minimize Rosenbrock function
/// let f = |x: &Vector<f32>| {
///     let a = x[0];
///     let b = x[1];
///     (1.0 - a).powi(2) + 100.0 * (b - a * a).powi(2)
/// };
///
/// let grad = |x: &Vector<f32>| {
///     let a = x[0];
///     let b = x[1];
///     Vector::from_slice(&[
///         -2.0 * (1.0 - a) - 400.0 * a * (b - a * a),
///         200.0 * (b - a * a),
///     ])
/// };
///
/// let x0 = Vector::from_slice(&[0.0, 0.0]);
/// let result = optimizer.minimize(f, grad, x0);
/// ```
#[derive(Debug, Clone)]
pub struct ConjugateGradient {
    /// Maximum number of iterations
    max_iter: usize,
    /// Convergence tolerance (gradient norm)
    tol: f32,
    /// Beta computation formula (FR, PR, or HS)
    beta_formula: CGBetaFormula,
    /// Restart interval (0 = no periodic restart, only on β < 0)
    restart_interval: usize,
    /// Line search strategy (strong Wolfe conditions)
    line_search: WolfeLineSearch,
    /// Previous search direction (for conjugacy); `None` forces steepest descent
    prev_direction: Option<Vector<f32>>,
    /// Previous gradient (for beta computation)
    prev_gradient: Option<Vector<f32>>,
    /// Iteration counter (for periodic restart)
    iter_count: usize,
}
971
972impl ConjugateGradient {
973    /// Creates a new Conjugate Gradient optimizer.
974    ///
975    /// # Arguments
976    ///
977    /// * `max_iter` - Maximum number of iterations (typical: 100-1000)
978    /// * `tol` - Convergence tolerance for gradient norm (typical: 1e-5)
979    /// * `beta_formula` - Method for computing β (`FletcherReeves`, `PolakRibiere`, or `HestenesStiefel`)
980    ///
981    /// # Example
982    ///
983    /// ```
984    /// use aprender::optim::{ConjugateGradient, CGBetaFormula};
985    ///
986    /// let optimizer = ConjugateGradient::new(100, 1e-5, CGBetaFormula::PolakRibiere);
987    /// ```
988    #[must_use]
989    pub fn new(max_iter: usize, tol: f32, beta_formula: CGBetaFormula) -> Self {
990        Self {
991            max_iter,
992            tol,
993            beta_formula,
994            restart_interval: 0, // No periodic restart by default
995            line_search: WolfeLineSearch::new(1e-4, 0.1, 50), // c2=0.1 for CG (more exact line search)
996            prev_direction: None,
997            prev_gradient: None,
998            iter_count: 0,
999        }
1000    }
1001
1002    /// Sets the restart interval.
1003    ///
1004    /// CG will restart with steepest descent every n iterations.
1005    /// Setting to 0 disables periodic restart (only restarts on β < 0).
1006    ///
1007    /// # Arguments
1008    ///
1009    /// * `interval` - Number of iterations between restarts (typical: n, where n is problem dimension)
1010    ///
1011    /// # Example
1012    ///
1013    /// ```
1014    /// use aprender::optim::{ConjugateGradient, CGBetaFormula};
1015    ///
1016    /// let optimizer = ConjugateGradient::new(100, 1e-5, CGBetaFormula::PolakRibiere)
1017    ///     .with_restart_interval(50);
1018    /// ```
1019    #[must_use]
1020    pub fn with_restart_interval(mut self, interval: usize) -> Self {
1021        self.restart_interval = interval;
1022        self
1023    }
1024
1025    /// Computes beta coefficient based on the chosen formula.
1026    fn compute_beta(
1027        &self,
1028        grad_new: &Vector<f32>,
1029        grad_old: &Vector<f32>,
1030        d_old: &Vector<f32>,
1031    ) -> f32 {
1032        let n = grad_new.len();
1033
1034        match self.beta_formula {
1035            CGBetaFormula::FletcherReeves => {
1036                // β = (g_{k+1}^T g_{k+1}) / (g_k^T g_k)
1037                let mut numerator = 0.0;
1038                let mut denominator = 0.0;
1039                for i in 0..n {
1040                    numerator += grad_new[i] * grad_new[i];
1041                    denominator += grad_old[i] * grad_old[i];
1042                }
1043                numerator / denominator.max(1e-12)
1044            }
1045            CGBetaFormula::PolakRibiere => {
1046                // β = g_{k+1}^T (g_{k+1} - g_k) / (g_k^T g_k)
1047                let mut numerator = 0.0;
1048                let mut denominator = 0.0;
1049                for i in 0..n {
1050                    numerator += grad_new[i] * (grad_new[i] - grad_old[i]);
1051                    denominator += grad_old[i] * grad_old[i];
1052                }
1053                let beta = numerator / denominator.max(1e-12);
1054                // PR has automatic restart: if β < 0, restart with steepest descent
1055                beta.max(0.0)
1056            }
1057            CGBetaFormula::HestenesStiefel => {
1058                // β = g_{k+1}^T (g_{k+1} - g_k) / (d_k^T (g_{k+1} - g_k))
1059                let mut numerator = 0.0;
1060                let mut denominator = 0.0;
1061                for i in 0..n {
1062                    let y_i = grad_new[i] - grad_old[i];
1063                    numerator += grad_new[i] * y_i;
1064                    denominator += d_old[i] * y_i;
1065                }
1066                let beta = numerator / denominator.max(1e-12);
1067                beta.max(0.0)
1068            }
1069        }
1070    }
1071
1072    /// Computes the L2 norm of a vector.
1073    fn norm(v: &Vector<f32>) -> f32 {
1074        let mut sum = 0.0;
1075        for i in 0..v.len() {
1076            sum += v[i] * v[i];
1077        }
1078        sum.sqrt()
1079    }
1080}
1081
impl Optimizer for ConjugateGradient {
    /// CG is a batch (full-gradient) method; per-mini-batch updates are
    /// intentionally unsupported. Use [`Optimizer::minimize`] instead.
    fn step(&mut self, _params: &mut Vector<f32>, _gradients: &Vector<f32>) {
        unimplemented!(
            "Conjugate Gradient does not support stochastic updates (step). Use minimize() for batch optimization."
        )
    }

    /// Runs the full nonlinear CG iteration until the gradient norm drops
    /// below `tol`, the line search stalls, a numerical error is detected,
    /// or `max_iter` is exhausted.
    #[allow(clippy::too_many_lines)]
    fn minimize<F, G>(&mut self, objective: F, gradient: G, x0: Vector<f32>) -> OptimizationResult
    where
        F: Fn(&Vector<f32>) -> f32,
        G: Fn(&Vector<f32>) -> Vector<f32>,
    {
        let start_time = std::time::Instant::now();
        let n = x0.len();

        // Reset state so repeated minimize() calls on the same instance
        // start from a clean slate.
        self.prev_direction = None;
        self.prev_gradient = None;
        self.iter_count = 0;

        let mut x = x0;
        let mut fx = objective(&x);
        let mut grad = gradient(&x);
        let mut grad_norm = Self::norm(&grad);

        for iter in 0..self.max_iter {
            // Check convergence
            if grad_norm < self.tol {
                return OptimizationResult {
                    solution: x,
                    objective_value: fx,
                    iterations: iter,
                    status: ConvergenceStatus::Converged,
                    gradient_norm: grad_norm,
                    constraint_violation: 0.0,
                    elapsed_time: start_time.elapsed(),
                };
            }

            // Compute search direction: conjugate update when history exists,
            // steepest descent otherwise.
            let d = if let (Some(d_old), Some(g_old)) = (&self.prev_direction, &self.prev_gradient)
            {
                // Check if we need a periodic restart (every `restart_interval`
                // iterations when enabled).
                let need_restart = if self.restart_interval > 0 {
                    self.iter_count % self.restart_interval == 0
                } else {
                    false
                };

                if need_restart {
                    // Restart with steepest descent
                    let mut d_new = Vector::zeros(n);
                    for i in 0..n {
                        d_new[i] = -grad[i];
                    }
                    d_new
                } else {
                    // Compute beta and conjugate direction
                    let beta = self.compute_beta(&grad, g_old, d_old);

                    // d = -grad + beta * d_old
                    let mut d_new = Vector::zeros(n);
                    for i in 0..n {
                        d_new[i] = -grad[i] + beta * d_old[i];
                    }

                    // Check if direction is descent (grad^T d < 0)
                    let mut grad_dot_d = 0.0;
                    for i in 0..n {
                        grad_dot_d += grad[i] * d_new[i];
                    }

                    if grad_dot_d >= 0.0 {
                        // Not a descent direction - restart with steepest descent
                        for i in 0..n {
                            d_new[i] = -grad[i];
                        }
                    }

                    d_new
                }
            } else {
                // First iteration: use steepest descent
                let mut d_new = Vector::zeros(n);
                for i in 0..n {
                    d_new[i] = -grad[i];
                }
                d_new
            };

            // Line search (Wolfe conditions, see `self.line_search`)
            let alpha = self.line_search.search(&objective, &gradient, &x, &d);

            // Check for stalled progress: the line search could not find a
            // meaningful step along d.
            if alpha < 1e-12 {
                return OptimizationResult {
                    solution: x,
                    objective_value: fx,
                    iterations: iter,
                    status: ConvergenceStatus::Stalled,
                    gradient_norm: grad_norm,
                    constraint_violation: 0.0,
                    elapsed_time: start_time.elapsed(),
                };
            }

            // Update position: x_new = x + alpha * d
            let mut x_new = Vector::zeros(n);
            for i in 0..n {
                x_new[i] = x[i] + alpha * d[i];
            }

            // Compute new objective and gradient
            let fx_new = objective(&x_new);
            let grad_new = gradient(&x_new);

            // Check for numerical errors; on failure return the last good
            // iterate, not the diverged one.
            if fx_new.is_nan() || fx_new.is_infinite() {
                return OptimizationResult {
                    solution: x,
                    objective_value: fx,
                    iterations: iter,
                    status: ConvergenceStatus::NumericalError,
                    gradient_norm: grad_norm,
                    constraint_violation: 0.0,
                    elapsed_time: start_time.elapsed(),
                };
            }

            // Store current direction and gradient for next iteration's
            // beta computation (must happen before overwriting `grad`).
            self.prev_direction = Some(d);
            self.prev_gradient = Some(grad);

            // Update for next iteration
            x = x_new;
            fx = fx_new;
            grad = grad_new;
            grad_norm = Self::norm(&grad);
            self.iter_count += 1;
        }

        // Max iterations reached
        OptimizationResult {
            solution: x,
            objective_value: fx,
            iterations: self.max_iter,
            status: ConvergenceStatus::MaxIterations,
            gradient_norm: grad_norm,
            constraint_violation: 0.0,
            elapsed_time: start_time.elapsed(),
        }
    }

    /// Clears CG history so the next `minimize` call starts fresh.
    fn reset(&mut self) {
        self.prev_direction = None;
        self.prev_gradient = None;
        self.iter_count = 0;
    }
}
1242
1243/// Damped Newton optimizer with finite-difference Hessian approximation.
1244///
1245/// Newton's method uses second-order information (Hessian) to find the minimum
1246/// by solving the linear system: H * d = -g, where H is the Hessian and g is
1247/// the gradient. The damping factor and line search ensure global convergence.
1248///
1249/// # Algorithm
1250///
1251/// 1. Compute gradient g = ∇f(x)
1252/// 2. Approximate Hessian H using finite differences
1253/// 3. Solve H * d = -g using Cholesky decomposition
1254/// 4. If Hessian not positive definite, fall back to steepest descent
1255/// 5. Line search along d to find step size α
1256/// 6. Update: x_{k+1} = `x_k` + α * `d_k`
1257///
1258/// # Parameters
1259///
1260/// - **`max_iter`**: Maximum number of iterations
1261/// - **tol**: Convergence tolerance (gradient norm)
1262/// - **epsilon**: Finite difference step size for Hessian approximation (default: 1e-5)
1263///
1264/// # Example
1265///
1266/// ```
1267/// use aprender::optim::{DampedNewton, Optimizer};
1268/// use aprender::primitives::Vector;
1269///
1270/// let mut optimizer = DampedNewton::new(100, 1e-5);
1271///
1272/// // Minimize quadratic function f(x,y) = x^2 + 2y^2
1273/// let f = |x: &Vector<f32>| x[0] * x[0] + 2.0 * x[1] * x[1];
1274/// let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 4.0 * x[1]]);
1275///
1276/// let x0 = Vector::from_slice(&[5.0, 3.0]);
1277/// let result = optimizer.minimize(f, grad, x0);
1278/// ```
1279#[derive(Debug, Clone)]
1280pub struct DampedNewton {
1281    /// Maximum number of iterations
1282    max_iter: usize,
1283    /// Convergence tolerance (gradient norm)
1284    tol: f32,
1285    /// Finite difference step size for Hessian approximation
1286    epsilon: f32,
1287    /// Line search strategy
1288    line_search: BacktrackingLineSearch,
1289}
1290
impl DampedNewton {
    /// Creates a new Damped Newton optimizer.
    ///
    /// # Arguments
    ///
    /// * `max_iter` - Maximum number of iterations (typical: 100-1000)
    /// * `tol` - Convergence tolerance for gradient norm (typical: 1e-5)
    ///
    /// # Example
    ///
    /// ```
    /// use aprender::optim::DampedNewton;
    ///
    /// let optimizer = DampedNewton::new(100, 1e-5);
    /// ```
    #[must_use]
    pub fn new(max_iter: usize, tol: f32) -> Self {
        Self {
            max_iter,
            tol,
            epsilon: 1e-5, // Finite difference step size
            line_search: BacktrackingLineSearch::new(1e-4, 0.5, 50),
        }
    }

    /// Sets the finite difference epsilon for Hessian approximation.
    ///
    /// # Arguments
    ///
    /// * `epsilon` - Step size for finite differences (typical: 1e-5 to 1e-8;
    ///   must be nonzero — the Hessian approximation divides by it)
    ///
    /// # Example
    ///
    /// ```
    /// use aprender::optim::DampedNewton;
    ///
    /// let optimizer = DampedNewton::new(100, 1e-5).with_epsilon(1e-6);
    /// ```
    #[must_use]
    pub fn with_epsilon(mut self, epsilon: f32) -> Self {
        self.epsilon = epsilon;
        self
    }

    /// Approximates the Hessian matrix using finite differences.
    ///
    /// Uses *forward* differences on the gradient: column i is
    /// H[:, i] ≈ (∇f(x + ε·`e_i`) - ∇f(x)) / ε, and the result is then
    /// symmetrized. (An earlier version of this doc claimed central
    /// differences, which the implementation does not use.)
    ///
    /// Cost: n + 1 gradient evaluations for an n-dimensional problem.
    fn approximate_hessian<G>(&self, grad: &G, x: &Vector<f32>) -> Matrix<f32>
    where
        G: Fn(&Vector<f32>) -> Vector<f32>,
    {
        let n = x.len();
        let mut h_data = vec![0.0; n * n];

        // Base gradient, reused for every column.
        let g0 = grad(x);

        // Compute Hessian using finite differences
        for i in 0..n {
            // Perturb x[i] by epsilon
            let mut x_plus = x.clone();
            x_plus[i] += self.epsilon;

            let g_plus = grad(&x_plus);

            // Approximate column i of Hessian: H[:,i] ≈ (g(x+ε*e_i) - g(x)) / ε
            for j in 0..n {
                h_data[j * n + i] = (g_plus[j] - g0[j]) / self.epsilon;
            }
        }

        // Symmetrize the Hessian (finite-difference noise breaks exact
        // symmetry; averaging H and H^T restores it, which the downstream
        // Cholesky solve requires).
        for i in 0..n {
            for j in (i + 1)..n {
                let avg = (h_data[i * n + j] + h_data[j * n + i]) / 2.0;
                h_data[i * n + j] = avg;
                h_data[j * n + i] = avg;
            }
        }

        Matrix::from_vec(n, n, h_data).expect("Matrix dimensions should be valid")
    }

    /// Computes the L2 norm of a vector.
    fn norm(v: &Vector<f32>) -> f32 {
        let mut sum = 0.0;
        for i in 0..v.len() {
            sum += v[i] * v[i];
        }
        sum.sqrt()
    }
}
1382
impl Optimizer for DampedNewton {
    /// Newton's method is a batch (full-gradient) method; per-mini-batch
    /// updates are intentionally unsupported. Use [`Optimizer::minimize`].
    fn step(&mut self, _params: &mut Vector<f32>, _gradients: &Vector<f32>) {
        unimplemented!(
            "Damped Newton does not support stochastic updates (step). Use minimize() for batch optimization."
        )
    }

    /// Runs the damped Newton iteration until the gradient norm drops below
    /// `tol`, the line search stalls, a numerical error is detected, or
    /// `max_iter` is exhausted.
    fn minimize<F, G>(&mut self, objective: F, gradient: G, x0: Vector<f32>) -> OptimizationResult
    where
        F: Fn(&Vector<f32>) -> f32,
        G: Fn(&Vector<f32>) -> Vector<f32>,
    {
        let start_time = std::time::Instant::now();
        let n = x0.len();

        let mut x = x0;
        let mut fx = objective(&x);
        let mut grad = gradient(&x);
        let mut grad_norm = Self::norm(&grad);

        for iter in 0..self.max_iter {
            // Check convergence
            if grad_norm < self.tol {
                return OptimizationResult {
                    solution: x,
                    objective_value: fx,
                    iterations: iter,
                    status: ConvergenceStatus::Converged,
                    gradient_norm: grad_norm,
                    constraint_violation: 0.0,
                    elapsed_time: start_time.elapsed(),
                };
            }

            // Approximate Hessian (n + 1 gradient evaluations per iteration)
            let hessian = self.approximate_hessian(&gradient, &x);

            // Negate gradient for solving H * d = -g
            let mut neg_grad = Vector::zeros(n);
            for i in 0..n {
                neg_grad[i] = -grad[i];
            }

            // Solve H * d = -g using Cholesky decomposition. Cholesky only
            // succeeds for (numerically) positive-definite H, so failure
            // doubles as the indefiniteness check.
            let d = if let Ok(direction) = hessian.cholesky_solve(&neg_grad) {
                // Check if it's a descent direction
                let mut grad_dot_d = 0.0;
                for i in 0..n {
                    grad_dot_d += grad[i] * direction[i];
                }

                if grad_dot_d < 0.0 {
                    // Valid descent direction from Newton step
                    direction
                } else {
                    // Not a descent direction - fall back to steepest descent
                    let mut sd = Vector::zeros(n);
                    for i in 0..n {
                        sd[i] = -grad[i];
                    }
                    sd
                }
            } else {
                // Hessian not positive definite - fall back to steepest descent
                let mut sd = Vector::zeros(n);
                for i in 0..n {
                    sd[i] = -grad[i];
                }
                sd
            };

            // Line search (backtracking/Armijo) provides the damping
            let alpha = self.line_search.search(&objective, &gradient, &x, &d);

            // Check for stalled progress: the line search could not find a
            // meaningful step along d.
            if alpha < 1e-12 {
                return OptimizationResult {
                    solution: x,
                    objective_value: fx,
                    iterations: iter,
                    status: ConvergenceStatus::Stalled,
                    gradient_norm: grad_norm,
                    constraint_violation: 0.0,
                    elapsed_time: start_time.elapsed(),
                };
            }

            // Update position: x_new = x + alpha * d
            let mut x_new = Vector::zeros(n);
            for i in 0..n {
                x_new[i] = x[i] + alpha * d[i];
            }

            // Compute new objective and gradient
            let fx_new = objective(&x_new);
            let grad_new = gradient(&x_new);

            // Check for numerical errors; on failure return the last good
            // iterate, not the diverged one.
            if fx_new.is_nan() || fx_new.is_infinite() {
                return OptimizationResult {
                    solution: x,
                    objective_value: fx,
                    iterations: iter,
                    status: ConvergenceStatus::NumericalError,
                    gradient_norm: grad_norm,
                    constraint_violation: 0.0,
                    elapsed_time: start_time.elapsed(),
                };
            }

            // Update for next iteration
            x = x_new;
            fx = fx_new;
            grad = grad_new;
            grad_norm = Self::norm(&grad);
        }

        // Max iterations reached
        OptimizationResult {
            solution: x,
            objective_value: fx,
            iterations: self.max_iter,
            status: ConvergenceStatus::MaxIterations,
            gradient_norm: grad_norm,
            constraint_violation: 0.0,
            elapsed_time: start_time.elapsed(),
        }
    }

    fn reset(&mut self) {
        // Damped Newton is stateless - nothing to reset
    }
}
1516
1517// ==================== Proximal Operators ====================
1518
1519/// Proximal operators for non-smooth regularization.
1520///
1521/// A proximal operator for function g is defined as:
1522/// ```text
1523/// prox_g(v) = argmin_x { g(x) + ½‖x - v‖² }
1524/// ```
1525///
1526/// These are essential building blocks for proximal gradient methods like FISTA.
1527pub mod prox {
1528    use crate::primitives::Vector;
1529
1530    /// Soft-thresholding operator for L1 regularization.
1531    ///
1532    /// Computes the proximal operator of the L1 norm: prox_{λ‖·‖₁}(v).
1533    ///
1534    /// # Formula
1535    ///
1536    /// ```text
1537    /// prox_{λ‖·‖₁}(v) = sign(v) ⊙ max(|v| - λ, 0)
1538    /// ```
1539    ///
1540    /// # Arguments
1541    ///
1542    /// * `v` - Input vector
1543    /// * `lambda` - Regularization parameter (λ ≥ 0)
1544    ///
1545    /// # Returns
1546    ///
1547    /// Soft-thresholded vector
1548    ///
1549    /// # Example
1550    ///
1551    /// ```
1552    /// use aprender::optim::prox::soft_threshold;
1553    /// use aprender::primitives::Vector;
1554    ///
1555    /// let v = Vector::from_slice(&[2.0, -1.5, 0.5]);
1556    /// let result = soft_threshold(&v, 1.0);
1557    ///
1558    /// assert!((result[0] - 1.0).abs() < 1e-6);  // 2.0 - 1.0 = 1.0
1559    /// assert!((result[1] + 0.5).abs() < 1e-6);  // -1.5 + 1.0 = -0.5
1560    /// assert!(result[2].abs() < 1e-6);          // 0.5 - 1.0 = 0 (thresholded)
1561    /// ```
1562    ///
1563    /// # Use Cases
1564    ///
1565    /// - **Lasso regression**: Sparse linear models with L1 penalty
1566    /// - **Compressed sensing**: Sparse signal recovery
1567    /// - **Feature selection**: Automatic variable selection via sparsity
1568    #[must_use]
1569    pub fn soft_threshold(v: &Vector<f32>, lambda: f32) -> Vector<f32> {
1570        let mut result = Vector::zeros(v.len());
1571        for i in 0..v.len() {
1572            let val = v[i];
1573            result[i] = if val > lambda {
1574                val - lambda
1575            } else if val < -lambda {
1576                val + lambda
1577            } else {
1578                0.0
1579            };
1580        }
1581        result
1582    }
1583
1584    /// Projects onto the non-negative orthant: x ≥ 0.
1585    ///
1586    /// # Arguments
1587    ///
1588    /// * `x` - Input vector
1589    ///
1590    /// # Returns
1591    ///
1592    /// Vector with all negative components set to zero
1593    ///
1594    /// # Example
1595    ///
1596    /// ```
1597    /// use aprender::optim::prox::nonnegative;
1598    /// use aprender::primitives::Vector;
1599    ///
1600    /// let x = Vector::from_slice(&[1.0, -2.0, 3.0, -0.5]);
1601    /// let result = nonnegative(&x);
1602    ///
1603    /// assert_eq!(result[0], 1.0);
1604    /// assert_eq!(result[1], 0.0);
1605    /// assert_eq!(result[2], 3.0);
1606    /// assert_eq!(result[3], 0.0);
1607    /// ```
1608    #[must_use]
1609    pub fn nonnegative(x: &Vector<f32>) -> Vector<f32> {
1610        let mut result = Vector::zeros(x.len());
1611        for i in 0..x.len() {
1612            result[i] = x[i].max(0.0);
1613        }
1614        result
1615    }
1616
1617    /// Projects onto an L2 ball: ‖x‖₂ ≤ radius.
1618    ///
1619    /// # Arguments
1620    ///
1621    /// * `x` - Input vector
1622    /// * `radius` - Ball radius (r > 0)
1623    ///
1624    /// # Returns
1625    ///
1626    /// Projected vector satisfying ‖result‖₂ ≤ radius
1627    ///
1628    /// # Example
1629    ///
1630    /// ```
1631    /// use aprender::optim::prox::project_l2_ball;
1632    /// use aprender::primitives::Vector;
1633    ///
1634    /// let x = Vector::from_slice(&[3.0, 4.0]); // norm = 5.0
1635    /// let result = project_l2_ball(&x, 2.0);
1636    ///
1637    /// // Should be scaled to norm = 2.0
1638    /// let norm = (result[0] * result[0] + result[1] * result[1]).sqrt();
1639    /// assert!((norm - 2.0).abs() < 1e-5);
1640    /// ```
1641    #[must_use]
1642    pub fn project_l2_ball(x: &Vector<f32>, radius: f32) -> Vector<f32> {
1643        let mut norm_sq = 0.0;
1644        for i in 0..x.len() {
1645            norm_sq += x[i] * x[i];
1646        }
1647        let norm = norm_sq.sqrt();
1648
1649        if norm <= radius {
1650            x.clone()
1651        } else {
1652            let scale = radius / norm;
1653            let mut result = Vector::zeros(x.len());
1654            for i in 0..x.len() {
1655                result[i] = scale * x[i];
1656            }
1657            result
1658        }
1659    }
1660
1661    /// Projects onto box constraints: lower ≤ x ≤ upper.
1662    ///
1663    /// # Arguments
1664    ///
1665    /// * `x` - Input vector
1666    /// * `lower` - Lower bounds
1667    /// * `upper` - Upper bounds
1668    ///
1669    /// # Returns
1670    ///
1671    /// Vector with components clipped to [lower, upper]
1672    ///
1673    /// # Example
1674    ///
1675    /// ```
1676    /// use aprender::optim::prox::project_box;
1677    /// use aprender::primitives::Vector;
1678    ///
1679    /// let x = Vector::from_slice(&[-1.0, 0.5, 2.0]);
1680    /// let lower = Vector::from_slice(&[0.0, 0.0, 0.0]);
1681    /// let upper = Vector::from_slice(&[1.0, 1.0, 1.0]);
1682    ///
1683    /// let result = project_box(&x, &lower, &upper);
1684    ///
1685    /// assert_eq!(result[0], 0.0);  // Clipped to lower
1686    /// assert_eq!(result[1], 0.5);  // Within bounds
1687    /// assert_eq!(result[2], 1.0);  // Clipped to upper
1688    /// ```
1689    #[must_use]
1690    pub fn project_box(x: &Vector<f32>, lower: &Vector<f32>, upper: &Vector<f32>) -> Vector<f32> {
1691        let mut result = Vector::zeros(x.len());
1692        for i in 0..x.len() {
1693            result[i] = x[i].max(lower[i]).min(upper[i]);
1694        }
1695        result
1696    }
1697}
1698
1699// ==================== FISTA Optimizer ====================
1700
/// FISTA (Fast Iterative Shrinkage-Thresholding Algorithm).
///
/// Accelerated proximal gradient method for composite objectives:
/// ```text
/// minimize f(x) + g(x)
/// ```
/// where f is smooth and convex and g is convex but possibly non-smooth,
/// as long as g has an inexpensive proximal operator.
///
/// Nesterov acceleration gives FISTA an O(1/k²) convergence rate, versus
/// O(1/k) for plain proximal gradient descent (ISTA).
///
/// # Key Applications
///
/// - **Lasso regression**: f(x) = ½‖Ax - b‖², g(x) = λ‖x‖₁
/// - **Elastic Net**: f(x) = ½‖Ax - b‖², g(x) = λ₁‖x‖₁ + λ₂‖x‖₂²
/// - **Total variation**: Image denoising with TV regularization
/// - **Non-negative least squares**: f(x) = ½‖Ax - b‖², g(x) = indicator(x ≥ 0)
///
/// # Example
///
/// ```
/// use aprender::optim::{FISTA, Optimizer, prox};
/// use aprender::primitives::Vector;
///
/// // Minimize: ½(x - 5)² + 2|x|  (L1-regularized quadratic)
/// let smooth = |x: &Vector<f32>| 0.5 * (x[0] - 5.0).powi(2);
/// let grad_smooth = |x: &Vector<f32>| Vector::from_slice(&[x[0] - 5.0]);
/// let proximal = |v: &Vector<f32>, _alpha: f32| prox::soft_threshold(v, 2.0);
///
/// let mut fista = FISTA::new(1000, 0.1, 1e-5);
/// let x0 = Vector::from_slice(&[0.0]);
/// let result = fista.minimize(smooth, grad_smooth, proximal, x0);
///
/// // Check that optimization completed successfully
/// assert!(!result.solution[0].is_nan());
/// ```
///
/// # References
///
/// - Beck & Teboulle (2009). "A fast iterative shrinkage-thresholding algorithm
///   for linear inverse problems." SIAM Journal on Imaging Sciences, 2(1), 183-202.
#[derive(Debug, Clone)]
pub struct FISTA {
    /// Iteration budget for `minimize`.
    max_iter: usize,
    /// Step size α > 0 (should not exceed 1/L for Lipschitz constant L of ∇f).
    step_size: f32,
    /// Stop once the step norm ‖xₖ₊₁ - xₖ‖ falls below this value.
    tol: f32,
}
1751
1752impl FISTA {
1753    /// Creates a new FISTA optimizer.
1754    ///
1755    /// # Arguments
1756    ///
1757    /// * `max_iter` - Maximum number of iterations
1758    /// * `step_size` - Step size α (should be ≤ 1/L where L is Lipschitz constant of ∇f)
1759    /// * `tol` - Convergence tolerance
1760    ///
1761    /// # Returns
1762    ///
1763    /// New FISTA optimizer instance
1764    ///
1765    /// # Example
1766    ///
1767    /// ```
1768    /// use aprender::optim::FISTA;
1769    ///
1770    /// let optimizer = FISTA::new(1000, 0.01, 1e-6);
1771    /// ```
1772    #[must_use]
1773    pub fn new(max_iter: usize, step_size: f32, tol: f32) -> Self {
1774        Self {
1775            max_iter,
1776            step_size,
1777            tol,
1778        }
1779    }
1780
1781    /// Minimizes a composite objective function using FISTA.
1782    ///
1783    /// Solves: minimize f(x) + g(x) where f is smooth, g is "simple" (has easy prox).
1784    ///
1785    /// # Type Parameters
1786    ///
1787    /// * `F` - Smooth objective function type
1788    /// * `G` - Gradient of smooth part type
1789    /// * `P` - Proximal operator type
1790    ///
1791    /// # Arguments
1792    ///
1793    /// * `smooth` - Smooth part f(x)
1794    /// * `grad_smooth` - Gradient ∇f(x)
1795    /// * `prox` - Proximal operator `prox_g(v`, α)
1796    /// * `x0` - Initial point
1797    ///
1798    /// # Returns
1799    ///
1800    /// [`OptimizationResult`] with solution and convergence information
1801    pub fn minimize<F, G, P>(
1802        &mut self,
1803        smooth: F,
1804        grad_smooth: G,
1805        prox: P,
1806        x0: Vector<f32>,
1807    ) -> OptimizationResult
1808    where
1809        F: Fn(&Vector<f32>) -> f32,
1810        G: Fn(&Vector<f32>) -> Vector<f32>,
1811        P: Fn(&Vector<f32>, f32) -> Vector<f32>,
1812    {
1813        let start_time = std::time::Instant::now();
1814
1815        let mut x = x0.clone();
1816        let mut y = x0;
1817        let mut t = 1.0; // Nesterov momentum parameter
1818
1819        for iter in 0..self.max_iter {
1820            // Proximal gradient step at y
1821            let grad_y = grad_smooth(&y);
1822
1823            // Compute: y - α * ∇f(y)
1824            let mut gradient_step = Vector::zeros(y.len());
1825            for i in 0..y.len() {
1826                gradient_step[i] = y[i] - self.step_size * grad_y[i];
1827            }
1828
1829            // Apply proximal operator
1830            let x_new = prox(&gradient_step, self.step_size);
1831
1832            // Check convergence
1833            let mut diff_norm = 0.0;
1834            for i in 0..x.len() {
1835                let diff = x_new[i] - x[i];
1836                diff_norm += diff * diff;
1837            }
1838            diff_norm = diff_norm.sqrt();
1839
1840            if diff_norm < self.tol {
1841                let final_obj = smooth(&x_new);
1842                return OptimizationResult {
1843                    solution: x_new,
1844                    objective_value: final_obj,
1845                    iterations: iter,
1846                    status: ConvergenceStatus::Converged,
1847                    gradient_norm: diff_norm, // Use step norm as proxy for gradient norm
1848                    constraint_violation: 0.0,
1849                    elapsed_time: start_time.elapsed(),
1850                };
1851            }
1852
1853            // Nesterov acceleration
1854            let t_new = (1.0_f32 + (1.0_f32 + 4.0_f32 * t * t).sqrt()) / 2.0_f32;
1855            let beta = (t - 1.0_f32) / t_new;
1856
1857            // y_new = x_new + β(x_new - x)
1858            let mut y_new = Vector::zeros(x.len());
1859            for i in 0..x.len() {
1860                y_new[i] = x_new[i] + beta * (x_new[i] - x[i]);
1861            }
1862
1863            x = x_new;
1864            y = y_new;
1865            t = t_new;
1866        }
1867
1868        // Max iterations reached
1869        let final_obj = smooth(&x);
1870        OptimizationResult {
1871            solution: x,
1872            objective_value: final_obj,
1873            iterations: self.max_iter,
1874            status: ConvergenceStatus::MaxIterations,
1875            gradient_norm: 0.0,
1876            constraint_violation: 0.0,
1877            elapsed_time: start_time.elapsed(),
1878        }
1879    }
1880}
1881
impl Optimizer for FISTA {
    /// Stochastic (mini-batch) updates are not supported by FISTA.
    ///
    /// # Panics
    ///
    /// Always panics. FISTA is a batch method; use [`FISTA::minimize`] instead.
    fn step(&mut self, _params: &mut Vector<f32>, _gradients: &Vector<f32>) {
        unimplemented!(
            "FISTA does not support stochastic updates (step). Use minimize() for batch optimization with proximal operators."
        )
    }

    fn reset(&mut self) {
        // FISTA is stateless - nothing to reset
        // (momentum parameter `t` and iterates live only inside `minimize`).
    }
}
1893
1894// ==================== Coordinate Descent Optimizer ====================
1895
/// Coordinate Descent optimizer for high-dimensional problems.
///
/// Optimizes one coordinate at a time, which can be much more efficient than
/// full gradient methods when the number of features is very large (n ≫ m).
///
/// # Algorithm
///
/// ```text
/// for k = 1, 2, ..., max_iter:
///     for i = 1, 2, ..., n (cyclic or random order):
///         xᵢ ← argmin f(x₁, ..., xᵢ₋₁, xᵢ, xᵢ₊₁, ..., xₙ)
/// ```
///
/// # Key Applications
///
/// - **Lasso regression**: Coordinate descent with soft-thresholding (scikit-learn default)
/// - **Elastic Net**: L1 + L2 regularization
/// - **SVM**: Sequential Minimal Optimization (SMO) variant
/// - **High-dimensional statistics**: n ≫ m scenarios
///
/// # Advantages
///
/// - Cheap per-coordinate updates (a full gradient step costs O(n) across all coordinates)
/// - No line search needed for many problems
/// - Handles non-differentiable objectives (e.g., L1)
/// - Cache-friendly memory access patterns
///
/// # Example
///
/// ```
/// use aprender::optim::{CoordinateDescent, Optimizer};
/// use aprender::primitives::Vector;
///
/// // Minimize: ½‖x - c‖² where c = [1, 2, 3]
/// // Coordinate i update: xᵢ = cᵢ (closed form)
/// let c = vec![1.0, 2.0, 3.0];
///
/// let update = move |x: &mut Vector<f32>, i: usize| {
///     x[i] = c[i]; // Closed-form solution for coordinate i
/// };
///
/// let mut cd = CoordinateDescent::new(100, 1e-6);
/// let x0 = Vector::from_slice(&[0.0, 0.0, 0.0]);
/// let result = cd.minimize(update, x0);
///
/// // Should converge to c
/// assert!((result.solution[0] - 1.0).abs() < 1e-5);
/// assert!((result.solution[1] - 2.0).abs() < 1e-5);
/// assert!((result.solution[2] - 3.0).abs() < 1e-5);
/// ```
///
/// # References
///
/// - Wright (2015). "Coordinate descent algorithms." Mathematical Programming.
/// - Friedman et al. (2010). "Regularization paths for generalized linear models via coordinate descent."
#[derive(Debug, Clone)]
pub struct CoordinateDescent {
    /// Maximum number of outer iterations (passes through all coordinates)
    max_iter: usize,
    /// Convergence tolerance (‖xₖ₊₁ - xₖ‖ < tol)
    tol: f32,
    /// Whether to use random coordinate order (vs cyclic); defaults to false
    random_order: bool,
}
1960
1961impl CoordinateDescent {
1962    /// Creates a new Coordinate Descent optimizer.
1963    ///
1964    /// # Arguments
1965    ///
1966    /// * `max_iter` - Maximum number of passes through all coordinates
1967    /// * `tol` - Convergence tolerance
1968    ///
1969    /// # Returns
1970    ///
1971    /// New Coordinate Descent optimizer with cyclic coordinate order
1972    ///
1973    /// # Example
1974    ///
1975    /// ```
1976    /// use aprender::optim::CoordinateDescent;
1977    ///
1978    /// let optimizer = CoordinateDescent::new(1000, 1e-6);
1979    /// ```
1980    #[must_use]
1981    pub fn new(max_iter: usize, tol: f32) -> Self {
1982        Self {
1983            max_iter,
1984            tol,
1985            random_order: false,
1986        }
1987    }
1988
1989    /// Sets whether to use random coordinate order.
1990    ///
1991    /// # Arguments
1992    ///
1993    /// * `random` - If true, coordinates are updated in random order each iteration
1994    ///
1995    /// # Returns
1996    ///
1997    /// Self for method chaining
1998    ///
1999    /// # Example
2000    ///
2001    /// ```
2002    /// use aprender::optim::CoordinateDescent;
2003    ///
2004    /// let cd = CoordinateDescent::new(1000, 1e-6).with_random_order(true);
2005    /// ```
2006    #[must_use]
2007    pub fn with_random_order(mut self, random: bool) -> Self {
2008        self.random_order = random;
2009        self
2010    }
2011
2012    /// Minimizes an objective using coordinate descent.
2013    ///
2014    /// The user provides a coordinate update function that modifies one coordinate
2015    /// at a time. This function should solve:
2016    /// ```text
2017    /// xᵢ ← argmin f(x₁, ..., xᵢ₋₁, xᵢ, xᵢ₊₁, ..., xₙ)
2018    /// ```
2019    ///
2020    /// # Type Parameters
2021    ///
2022    /// * `U` - Coordinate update function type
2023    ///
2024    /// # Arguments
2025    ///
2026    /// * `update` - Function that updates coordinate i: `fn(&mut Vector, usize)`
2027    /// * `x0` - Initial point
2028    ///
2029    /// # Returns
2030    ///
2031    /// [`OptimizationResult`] with solution and convergence information
2032    ///
2033    /// # Example
2034    ///
2035    /// ```
2036    /// use aprender::optim::{CoordinateDescent, prox};
2037    /// use aprender::primitives::Vector;
2038    ///
2039    /// // Lasso coordinate update: soft-thresholding
2040    /// let lambda = 0.1;
2041    /// let update = move |x: &mut Vector<f32>, i: usize| {
2042    ///     // Simplified: actual Lasso requires computing residuals
2043    ///     let v = x[i];
2044    ///     x[i] = if v > lambda {
2045    ///         v - lambda
2046    ///     } else if v < -lambda {
2047    ///         v + lambda
2048    ///     } else {
2049    ///         0.0
2050    ///     };
2051    /// };
2052    ///
2053    /// let mut cd = CoordinateDescent::new(100, 1e-6);
2054    /// let x0 = Vector::from_slice(&[1.0, -0.5, 0.3]);
2055    /// let result = cd.minimize(update, x0);
2056    /// ```
2057    pub fn minimize<U>(&mut self, mut update: U, x0: Vector<f32>) -> OptimizationResult
2058    where
2059        U: FnMut(&mut Vector<f32>, usize),
2060    {
2061        let start_time = std::time::Instant::now();
2062        let n = x0.len();
2063
2064        let mut x = x0;
2065
2066        for iter in 0..self.max_iter {
2067            // Save previous iterate for convergence check
2068            let x_old = x.clone();
2069
2070            // Determine coordinate order
2071            if self.random_order {
2072                // Random permutation (Fisher-Yates shuffle)
2073                let mut indices: Vec<usize> = (0..n).collect();
2074                for i in (1..n).rev() {
2075                    let j = (i as f32 * 0.123456).rem_euclid(1.0); // Simple pseudo-random
2076                    let j = (j * (i + 1) as f32) as usize;
2077                    indices.swap(i, j);
2078                }
2079
2080                // Update in random order
2081                for i in indices {
2082                    update(&mut x, i);
2083                }
2084            } else {
2085                // Cyclic order
2086                for i in 0..n {
2087                    update(&mut x, i);
2088                }
2089            }
2090
2091            // Check convergence
2092            let mut diff_norm = 0.0;
2093            for i in 0..n {
2094                let diff = x[i] - x_old[i];
2095                diff_norm += diff * diff;
2096            }
2097            diff_norm = diff_norm.sqrt();
2098
2099            if diff_norm < self.tol {
2100                return OptimizationResult {
2101                    solution: x,
2102                    objective_value: 0.0, // Objective not tracked
2103                    iterations: iter,
2104                    status: ConvergenceStatus::Converged,
2105                    gradient_norm: diff_norm,
2106                    constraint_violation: 0.0,
2107                    elapsed_time: start_time.elapsed(),
2108                };
2109            }
2110        }
2111
2112        // Max iterations reached
2113        OptimizationResult {
2114            solution: x,
2115            objective_value: 0.0,
2116            iterations: self.max_iter,
2117            status: ConvergenceStatus::MaxIterations,
2118            gradient_norm: 0.0,
2119            constraint_violation: 0.0,
2120            elapsed_time: start_time.elapsed(),
2121        }
2122    }
2123}
2124
impl Optimizer for CoordinateDescent {
    /// Stochastic (mini-batch) updates are not supported by Coordinate Descent.
    ///
    /// # Panics
    ///
    /// Always panics. Use [`CoordinateDescent::minimize`] with a coordinate
    /// update function instead.
    fn step(&mut self, _params: &mut Vector<f32>, _gradients: &Vector<f32>) {
        unimplemented!(
            "Coordinate Descent does not support stochastic updates (step). Use minimize() with coordinate update function."
        )
    }

    fn reset(&mut self) {
        // Coordinate Descent is stateless - nothing to reset
        // (the iterate `x` lives only inside `minimize`).
    }
}
2136
/// ADMM (Alternating Direction Method of Multipliers) for distributed and constrained optimization.
///
/// Solves problems of the form:
/// ```text
/// minimize  f(x) + g(z)
/// subject to Ax + Bz = c
/// ```
///
/// # Applications
///
/// - **Distributed Lasso**: Split data across workers for large-scale regression
/// - **Consensus optimization**: Average models from different sites (federated learning)
/// - **Constrained problems**: Equality-constrained optimization via consensus
/// - **Model parallelism**: Parallelize training across devices
///
/// # Algorithm
///
/// ADMM alternates between three updates:
///
/// 1. **x-update**: `x^{k+1} = argmin_x { f(x) + (ρ/2)‖Ax + Bz^k - c + u^k‖² }`
/// 2. **z-update**: `z^{k+1} = argmin_z { g(z) + (ρ/2)‖Ax^{k+1} + Bz - c + u^k‖² }`
/// 3. **u-update**: `u^{k+1} = u^k + (Ax^{k+1} + Bz^{k+1} - c)`
///
/// where u is the scaled dual variable and ρ is the penalty parameter.
///
/// # Convergence
///
/// - **Rate**: O(1/k) for convex f and g
/// - **Stopping criteria**: Both primal and dual residuals below tolerance
/// - **Adaptive ρ**: Automatically adjusts penalty parameter for faster convergence
///
/// # Example: Consensus Form (Lasso)
///
/// For Lasso regression with consensus constraint x = z:
/// ```rust
/// use aprender::optim::ADMM;
/// use aprender::primitives::{Vector, Matrix};
///
/// let n = 5;
/// let m = 10;
///
/// // Create problem data
/// let A = Matrix::eye(n); // Identity for consensus
/// let B = Matrix::eye(n);
/// let c = Vector::zeros(n);
///
/// // x-minimizer: least squares update
/// let _data_matrix = Matrix::eye(m); // Your data matrix (unused in this simplified example)
/// let _observations = Vector::ones(m); // Your observations (unused in this simplified example)
/// let lambda = 0.1;
///
/// let x_minimizer = |z: &Vector<f32>, u: &Vector<f32>, _c: &Vector<f32>, rho: f32| {
///     // Minimize ½‖Dx - b‖² + (ρ/2)‖x - z + u‖²
///     // Closed form: x = (DᵀD + ρI)⁻¹(Dᵀb + ρ(z - u))
///     let mut rhs = Vector::zeros(n);
///     for i in 0..n {
///         rhs[i] = rho * (z[i] - u[i]);
///     }
///     rhs // Simplified for example
/// };
///
/// // z-minimizer: soft-thresholding (proximal operator for L1)
/// let z_minimizer = |ax: &Vector<f32>, u: &Vector<f32>, _c: &Vector<f32>, rho: f32| {
///     let mut z = Vector::zeros(n);
///     for i in 0..n {
///         let v = ax[i] + u[i];
///         let threshold = lambda / rho;
///         z[i] = if v > threshold {
///             v - threshold
///         } else if v < -threshold {
///             v + threshold
///         } else {
///             0.0
///         };
///     }
///     z
/// };
///
/// let mut admm = ADMM::new(100, 1.0, 1e-4).with_adaptive_rho(true);
/// let x0 = Vector::zeros(n);
/// let z0 = Vector::zeros(n);
///
/// let result = admm.minimize_consensus(
///     x_minimizer,
///     z_minimizer,
///     &A,
///     &B,
///     &c,
///     x0,
///     z0,
/// );
/// ```
///
/// # Reference
///
/// Boyd, S., Parikh, N., Chu, E., Peleato, B., & Eckstein, J. (2011).
/// "Distributed Optimization and Statistical Learning via the Alternating Direction Method of Multipliers"
/// Foundations and Trends in Machine Learning, 3(1), 1-122.
#[derive(Debug, Clone)]
pub struct ADMM {
    /// Maximum number of iterations
    max_iter: usize,
    /// Penalty parameter (controls constraint enforcement)
    rho: f32,
    /// Tolerance for convergence (primal + dual residuals)
    tol: f32,
    /// Whether to adaptively adjust rho; defaults to false
    adaptive_rho: bool,
    /// Factor for increasing rho when primal residual is large (default 2.0)
    rho_increase: f32,
    /// Factor for decreasing rho when dual residual is large (default 2.0)
    rho_decrease: f32,
}
2250
2251impl ADMM {
2252    /// Creates a new ADMM optimizer.
2253    ///
2254    /// # Parameters
2255    ///
2256    /// - `max_iter`: Maximum number of iterations (typical: 100-1000)
2257    /// - `rho`: Penalty parameter (typical: 0.1-10.0, problem-dependent)
2258    /// - `tol`: Convergence tolerance for residuals (typical: 1e-4 to 1e-6)
2259    ///
2260    /// # Returns
2261    ///
2262    /// A new ADMM optimizer with default settings:
2263    /// - Adaptive rho: disabled (use `with_adaptive_rho(true)` to enable)
2264    /// - Rho increase factor: 2.0
2265    /// - Rho decrease factor: 2.0
2266    ///
2267    /// # Example
2268    ///
2269    /// ```
2270    /// use aprender::optim::ADMM;
2271    ///
2272    /// let admm = ADMM::new(100, 1.0, 1e-4);
2273    /// ```
2274    #[must_use]
2275    pub fn new(max_iter: usize, rho: f32, tol: f32) -> Self {
2276        Self {
2277            max_iter,
2278            rho,
2279            tol,
2280            adaptive_rho: false,
2281            rho_increase: 2.0,
2282            rho_decrease: 2.0,
2283        }
2284    }
2285
2286    /// Enables or disables adaptive penalty parameter adjustment.
2287    ///
2288    /// When enabled, rho is automatically adjusted based on the ratio of primal to dual residuals:
2289    /// - If primal residual >> dual residual: increase rho (enforce constraints more)
2290    /// - If dual residual >> primal residual: decrease rho (improve objective)
2291    ///
2292    /// # Example
2293    ///
2294    /// ```
2295    /// use aprender::optim::ADMM;
2296    ///
2297    /// let admm = ADMM::new(100, 1.0, 1e-4).with_adaptive_rho(true);
2298    /// ```
2299    #[must_use]
2300    pub fn with_adaptive_rho(mut self, adaptive: bool) -> Self {
2301        self.adaptive_rho = adaptive;
2302        self
2303    }
2304
2305    /// Sets the factors for adaptive rho adjustment.
2306    ///
2307    /// # Parameters
2308    ///
2309    /// - `increase`: Factor to multiply rho when primal residual is large (default: 2.0)
2310    /// - `decrease`: Factor to divide rho when dual residual is large (default: 2.0)
2311    #[must_use]
2312    pub fn with_rho_factors(mut self, increase: f32, decrease: f32) -> Self {
2313        self.rho_increase = increase;
2314        self.rho_decrease = decrease;
2315        self
2316    }
2317
2318    /// Minimizes a consensus-form ADMM problem.
2319    ///
2320    /// Solves: minimize f(x) + g(z) subject to Ax + Bz = c
2321    ///
2322    /// # Parameters
2323    ///
2324    /// - `x_minimizer`: Function that solves x-subproblem given (z, u, c, rho)
2325    /// - `z_minimizer`: Function that solves z-subproblem given (Ax, u, c, rho)
2326    /// - `A`, `B`, `c`: Constraint matrices and vector (Ax + Bz = c)
2327    /// - `x0`, `z0`: Initial values for x and z
2328    ///
2329    /// # Returns
2330    ///
2331    /// `OptimizationResult` containing the optimal x value and convergence information.
2332    ///
2333    /// # Minimizer Functions
2334    ///
2335    /// The `x_minimizer` should solve:
2336    /// ```text
2337    /// argmin_x { f(x) + (ρ/2)‖Ax + Bz - c + u‖² }
2338    /// ```
2339    ///
2340    /// The `z_minimizer` should solve:
2341    /// ```text
2342    /// argmin_z { g(z) + (ρ/2)‖Ax + Bz - c + u‖² }
2343    /// ```
2344    ///
2345    /// These often have closed-form solutions or can use proximal operators.
2346    #[allow(clippy::too_many_arguments)]
2347    pub fn minimize_consensus<F, G>(
2348        &mut self,
2349        x_minimizer: F,
2350        z_minimizer: G,
2351        a: &Matrix<f32>,
2352        b_mat: &Matrix<f32>,
2353        c: &Vector<f32>,
2354        x0: Vector<f32>,
2355        z0: Vector<f32>,
2356    ) -> OptimizationResult
2357    where
2358        F: Fn(&Vector<f32>, &Vector<f32>, &Vector<f32>, f32) -> Vector<f32>,
2359        G: Fn(&Vector<f32>, &Vector<f32>, &Vector<f32>, f32) -> Vector<f32>,
2360    {
2361        let start_time = std::time::Instant::now();
2362
2363        let mut x = x0;
2364        let mut z = z0;
2365        let mut u = Vector::zeros(c.len());
2366        let mut rho = self.rho;
2367
2368        let mut z_old = z.clone();
2369
2370        for iter in 0..self.max_iter {
2371            // x-update: minimize f(x) + (ρ/2)‖Ax + Bz - c + u‖²
2372            x = x_minimizer(&z, &u, c, rho);
2373
2374            // z-update: minimize g(z) + (ρ/2)‖Ax + Bz - c + u‖²
2375            let ax = a.matvec(&x).expect("Matrix-vector multiplication");
2376            z = z_minimizer(&ax, &u, c, rho);
2377
2378            // Compute residual: r = Ax + Bz - c
2379            let bz = b_mat.matvec(&z).expect("Matrix-vector multiplication");
2380            let residual = &(&ax + &bz) - c;
2381
2382            // u-update: u^{k+1} = u^k + r^{k+1}
2383            u = &u + &residual;
2384
2385            // Compute primal residual: ‖Ax + Bz - c‖
2386            let primal_res = residual.norm();
2387
2388            // Compute dual residual: ρ‖Bᵀ(z^{k+1} - z^k)‖
2389            let z_diff = &z - &z_old;
2390            let bt_z_diff = b_mat
2391                .transpose()
2392                .matvec(&z_diff)
2393                .expect("Matrix-vector multiplication");
2394            let dual_res = rho * bt_z_diff.norm();
2395
2396            // Check convergence
2397            if primal_res < self.tol && dual_res < self.tol {
2398                return OptimizationResult {
2399                    solution: x,
2400                    objective_value: 0.0, // Objective not tracked (requires f and g evaluations)
2401                    iterations: iter + 1,
2402                    status: ConvergenceStatus::Converged,
2403                    gradient_norm: dual_res,
2404                    constraint_violation: primal_res,
2405                    elapsed_time: start_time.elapsed(),
2406                };
2407            }
2408
2409            // Adaptive rho adjustment (Boyd et al. 2011, Section 3.4.1)
2410            if self.adaptive_rho && iter % 10 == 0 {
2411                if primal_res > 10.0 * dual_res {
2412                    // Primal residual is large: increase rho to enforce constraints
2413                    rho *= self.rho_increase;
2414                    // Rescale dual variable: u = u / rho_increase
2415                    let scale = 1.0 / self.rho_increase;
2416                    u = u.mul_scalar(scale);
2417                } else if dual_res > 10.0 * primal_res {
2418                    // Dual residual is large: decrease rho to improve objective
2419                    rho /= self.rho_decrease;
2420                    // Rescale dual variable: u = u * rho_decrease
2421                    u = u.mul_scalar(self.rho_decrease);
2422                }
2423            }
2424
2425            z_old = z.clone();
2426        }
2427
2428        // Max iterations reached
2429        OptimizationResult {
2430            solution: x,
2431            objective_value: 0.0,
2432            iterations: self.max_iter,
2433            status: ConvergenceStatus::MaxIterations,
2434            gradient_norm: 0.0,
2435            constraint_violation: 0.0,
2436            elapsed_time: start_time.elapsed(),
2437        }
2438    }
2439}
2440
impl Optimizer for ADMM {
    /// Stochastic (mini-batch) updates are not supported by ADMM.
    ///
    /// # Panics
    ///
    /// Always panics. Use [`ADMM::minimize_consensus`] instead.
    fn step(&mut self, _params: &mut Vector<f32>, _gradients: &Vector<f32>) {
        unimplemented!(
            "ADMM does not support stochastic updates (step). Use minimize_consensus() with x-minimizer and z-minimizer functions."
        )
    }

    fn reset(&mut self) {
        // ADMM is stateless - nothing to reset
        // (x, z, u, and the working rho live only inside `minimize_consensus`).
    }
}
2452
2453// ==================== Phase 3: Constrained Optimization ====================
2454
/// Projected Gradient Descent for constrained optimization.
///
/// Solves problems of the form:
/// ```text
/// minimize f(x)
/// subject to x ∈ C
/// ```
///
/// where C is a convex set with efficient projection operator.
///
/// # Algorithm
///
/// ```text
/// for k = 1, 2, ..., max_iter:
///     x_k+1 = P_C(x_k - α∇f(x_k))
/// ```
///
/// where `P_C` is the projection onto constraint set C.
///
/// # Key Applications
///
/// - **Non-negative least squares**: C = {x : x ≥ 0}
/// - **Box constraints**: C = {x : l ≤ x ≤ u}
/// - **L2 ball**: C = {x : ‖x‖₂ ≤ r}
/// - **Simplex**: C = {x : x ≥ 0, Σx = 1}
///
/// # Convergence
///
/// - For convex f: O(1/k) convergence rate
/// - For strongly convex f: Linear convergence
/// - Step size α can be constant or use line search
///
/// # Example
///
/// ```
/// use aprender::optim::{ProjectedGradientDescent, prox};
/// use aprender::primitives::Vector;
///
/// // Minimize: ½‖x - c‖² subject to x ≥ 0
/// let c = Vector::from_slice(&[1.0, -2.0, 3.0, -1.0]);
///
/// let objective = |x: &Vector<f32>| {
///     let mut obj = 0.0;
///     for i in 0..x.len() {
///         let diff = x[i] - c[i];
///         obj += 0.5 * diff * diff;
///     }
///     obj
/// };
///
/// let gradient = |x: &Vector<f32>| {
///     let mut grad = Vector::zeros(x.len());
///     for i in 0..x.len() {
///         grad[i] = x[i] - c[i];
///     }
///     grad
/// };
///
/// let project = |x: &Vector<f32>| prox::nonnegative(x);
///
/// let mut pgd = ProjectedGradientDescent::new(1000, 0.1, 1e-6);
/// let x0 = Vector::zeros(4);
/// let result = pgd.minimize(objective, gradient, project, x0);
///
/// // Solution should be max(c, 0) = [1.0, 0.0, 3.0, 0.0]
/// assert!((result.solution[0] - 1.0).abs() < 1e-4);
/// assert!(result.solution[1].abs() < 1e-4);
/// assert!((result.solution[2] - 3.0).abs() < 1e-4);
/// assert!(result.solution[3].abs() < 1e-4);
/// ```
///
/// # References
///
/// - Bertsekas (1999). "Nonlinear Programming."
/// - Beck & Teboulle (2009). "Gradient-based algorithms with applications to signal recovery."
#[derive(Debug, Clone)]
pub struct ProjectedGradientDescent {
    /// Maximum number of iterations
    max_iter: usize,
    /// Step size (learning rate)
    step_size: f32,
    /// Convergence tolerance
    tol: f32,
    /// Use backtracking line search (disabled by default; enable via `with_line_search`)
    use_line_search: bool,
    /// Backtracking parameter (0 < beta < 1); defaults to 0.5, set via `with_line_search`
    beta: f32,
}
2543
2544impl ProjectedGradientDescent {
2545    /// Creates a new Projected Gradient Descent optimizer.
2546    ///
2547    /// # Arguments
2548    ///
2549    /// * `max_iter` - Maximum number of iterations
2550    /// * `step_size` - Initial step size (fixed if line search disabled)
2551    /// * `tol` - Convergence tolerance
2552    ///
2553    /// # Example
2554    ///
2555    /// ```
2556    /// use aprender::optim::ProjectedGradientDescent;
2557    ///
2558    /// let optimizer = ProjectedGradientDescent::new(1000, 0.1, 1e-6);
2559    /// ```
2560    #[must_use]
2561    pub fn new(max_iter: usize, step_size: f32, tol: f32) -> Self {
2562        Self {
2563            max_iter,
2564            step_size,
2565            tol,
2566            use_line_search: false,
2567            beta: 0.5,
2568        }
2569    }
2570
2571    /// Enables backtracking line search.
2572    ///
2573    /// # Arguments
2574    ///
2575    /// * `beta` - Backtracking parameter (0 < beta < 1, typically 0.5)
2576    ///
2577    /// # Example
2578    ///
2579    /// ```
2580    /// use aprender::optim::ProjectedGradientDescent;
2581    ///
2582    /// let optimizer = ProjectedGradientDescent::new(1000, 1.0, 1e-6)
2583    ///     .with_line_search(0.5);
2584    /// ```
2585    #[must_use]
2586    pub fn with_line_search(mut self, beta: f32) -> Self {
2587        self.use_line_search = true;
2588        self.beta = beta;
2589        self
2590    }
2591
2592    /// Minimizes objective function subject to projection constraint.
2593    ///
2594    /// # Arguments
2595    ///
2596    /// * `objective` - Objective function f(x)
2597    /// * `gradient` - Gradient function ∇f(x)
2598    /// * `project` - Projection operator `P_C(x)`
2599    /// * `x0` - Initial point
2600    ///
2601    /// # Returns
2602    ///
2603    /// Optimization result with converged solution
2604    pub fn minimize<F, G, P>(
2605        &mut self,
2606        objective: F,
2607        gradient: G,
2608        project: P,
2609        x0: Vector<f32>,
2610    ) -> OptimizationResult
2611    where
2612        F: Fn(&Vector<f32>) -> f32,
2613        G: Fn(&Vector<f32>) -> Vector<f32>,
2614        P: Fn(&Vector<f32>) -> Vector<f32>,
2615    {
2616        let start_time = std::time::Instant::now();
2617
2618        let mut x = x0;
2619        let mut alpha = self.step_size;
2620
2621        for iter in 0..self.max_iter {
2622            // Compute gradient
2623            let grad = gradient(&x);
2624
2625            // Gradient step: y = x - α∇f(x)
2626            let mut y = Vector::zeros(x.len());
2627            for i in 0..x.len() {
2628                y[i] = x[i] - alpha * grad[i];
2629            }
2630
2631            // Project onto constraint set
2632            let x_new = project(&y);
2633
2634            // Line search if enabled
2635            if self.use_line_search {
2636                let f_x = objective(&x);
2637                let f_x_new = objective(&x_new);
2638
2639                // Backtracking: reduce step size until sufficient decrease
2640                let mut ls_iter = 0;
2641                while f_x_new > f_x && ls_iter < 20 {
2642                    alpha *= self.beta;
2643
2644                    for i in 0..x.len() {
2645                        y[i] = x[i] - alpha * grad[i];
2646                    }
2647                    let x_new_ls = project(&y);
2648
2649                    if objective(&x_new_ls) <= f_x {
2650                        break;
2651                    }
2652                    ls_iter += 1;
2653                }
2654            }
2655
2656            // Check convergence
2657            let mut diff_norm = 0.0;
2658            for i in 0..x.len() {
2659                let diff = x_new[i] - x[i];
2660                diff_norm += diff * diff;
2661            }
2662            diff_norm = diff_norm.sqrt();
2663
2664            // Compute gradient norm at new point
2665            let grad_new = gradient(&x_new);
2666            let mut grad_norm = 0.0;
2667            for i in 0..grad_new.len() {
2668                grad_norm += grad_new[i] * grad_new[i];
2669            }
2670            grad_norm = grad_norm.sqrt();
2671
2672            x = x_new;
2673
2674            if diff_norm < self.tol {
2675                let final_obj = objective(&x);
2676                return OptimizationResult {
2677                    solution: x,
2678                    objective_value: final_obj,
2679                    iterations: iter + 1,
2680                    status: ConvergenceStatus::Converged,
2681                    gradient_norm: grad_norm,
2682                    constraint_violation: 0.0,
2683                    elapsed_time: start_time.elapsed(),
2684                };
2685            }
2686        }
2687
2688        // Max iterations reached
2689        let final_obj = objective(&x);
2690        let grad_final = gradient(&x);
2691        let mut grad_norm = 0.0;
2692        for i in 0..grad_final.len() {
2693            grad_norm += grad_final[i] * grad_final[i];
2694        }
2695        grad_norm = grad_norm.sqrt();
2696
2697        OptimizationResult {
2698            solution: x,
2699            objective_value: final_obj,
2700            iterations: self.max_iter,
2701            status: ConvergenceStatus::MaxIterations,
2702            gradient_norm: grad_norm,
2703            constraint_violation: 0.0,
2704            elapsed_time: start_time.elapsed(),
2705        }
2706    }
2707}
2708
impl Optimizer for ProjectedGradientDescent {
    /// Stochastic (mini-batch) updates are not supported by PGD.
    ///
    /// # Panics
    ///
    /// Always panics. Use [`ProjectedGradientDescent::minimize`] instead.
    fn step(&mut self, _params: &mut Vector<f32>, _gradients: &Vector<f32>) {
        unimplemented!(
            "Projected Gradient Descent does not support stochastic updates (step). Use minimize() with projection operator."
        )
    }

    fn reset(&mut self) {
        // No persistent optimizer state: the working step size (`alpha`) is a
        // local variable inside `minimize`, so each call already starts fresh
        // from `self.step_size`. Nothing to reset here.
    }
}
2721
2722/// Augmented Lagrangian method for constrained optimization.
2723///
2724/// Solves problems with equality constraints:
2725/// ```text
2726/// minimize f(x)
2727/// subject to: h(x) = 0  (equality constraints)
2728/// ```
2729///
2730/// # Algorithm
2731///
2732/// ```text
2733/// Augmented Lagrangian: L_ρ(x, λ) = f(x) + λᵀh(x) + ½ρ‖h(x)‖²
2734///
2735/// for k = 1, 2, ..., max_iter:
2736///     x_k = argmin L_ρ(x, λ_k)
2737///     λ_k+1 = λ_k + ρ h(x_k)
2738///     if ‖h(x)‖ is small: converged
2739///     else: increase ρ
2740/// ```
2741///
2742/// # Key Features
2743///
2744/// - **Penalty parameter ρ**: Increased adaptively for faster convergence
2745/// - **Lagrange multipliers λ**: Automatically updated for equality constraints
2746/// - **Flexible subproblem solver**: Uses gradient descent for inner loop
2747/// - **Convergence**: Superlinear under regularity conditions
2748///
2749/// # Applications
2750///
2751/// - **Equality constraints**: Linear systems, manifold optimization
2752/// - **ADMM**: Alternating Direction Method of Multipliers (special case)
2753/// - **Consensus optimization**: Distributed optimization, federated learning
2754/// - **PDE-constrained optimization**: Physics-informed neural networks
2755///
2756/// # Example
2757///
2758/// ```
2759/// use aprender::optim::AugmentedLagrangian;
2760/// use aprender::primitives::Vector;
2761///
2762/// // Minimize: ½(x₁-2)² + ½(x₂-3)² subject to x₁ + x₂ = 1
2763/// let objective = |x: &Vector<f32>| {
2764///     0.5 * (x[0] - 2.0).powi(2) + 0.5 * (x[1] - 3.0).powi(2)
2765/// };
2766///
2767/// let gradient = |x: &Vector<f32>| {
2768///     Vector::from_slice(&[x[0] - 2.0, x[1] - 3.0])
2769/// };
2770///
2771/// // Equality constraint: x₁ + x₂ - 1 = 0
2772/// let equality = |x: &Vector<f32>| Vector::from_slice(&[x[0] + x[1] - 1.0]);
2773///
2774/// let equality_jac = |_x: &Vector<f32>| {
2775///     vec![Vector::from_slice(&[1.0, 1.0])]
2776/// };
2777///
2778/// let mut al = AugmentedLagrangian::new(100, 1e-6, 1.0);
2779/// let x0 = Vector::zeros(2);
2780/// let result = al.minimize_equality(objective, gradient, equality, equality_jac, x0);
2781///
2782/// assert!(result.constraint_violation < 1e-4);
2783/// ```
2784///
2785/// # References
2786///
2787/// - Nocedal & Wright (2006). "Numerical Optimization." Chapter 17.
2788/// - Bertsekas (1982). "Constrained Optimization and Lagrange Multiplier Methods."
#[derive(Debug, Clone)]
pub struct AugmentedLagrangian {
    /// Maximum number of outer iterations (multiplier/penalty updates)
    max_iter: usize,
    /// Convergence tolerance on the constraint violation ‖h(x)‖
    tol: f32,
    /// Initial penalty parameter; ρ is restored to this value by `reset`
    /// and at the start of every `minimize_equality` call
    initial_rho: f32,
    /// Current penalty parameter ρ (grows during optimization, capped by `rho_max`)
    rho: f32,
    /// Penalty increase factor (> 1; default 2.0 set in `new`)
    rho_increase: f32,
    /// Maximum penalty parameter (default 1e6 set in `new`)
    rho_max: f32,
}
2804
impl AugmentedLagrangian {
    /// Creates a new Augmented Lagrangian optimizer.
    ///
    /// # Arguments
    ///
    /// * `max_iter` - Maximum number of outer iterations
    /// * `tol` - Convergence tolerance for constraint violation
    /// * `initial_rho` - Initial penalty parameter (typically 1.0-10.0)
    ///
    /// # Example
    ///
    /// ```
    /// use aprender::optim::AugmentedLagrangian;
    ///
    /// let optimizer = AugmentedLagrangian::new(100, 1e-6, 1.0);
    /// ```
    #[must_use]
    pub fn new(max_iter: usize, tol: f32, initial_rho: f32) -> Self {
        Self {
            max_iter,
            tol,
            initial_rho,
            rho: initial_rho,
            rho_increase: 2.0, // default: double ρ while constraints stay violated
            rho_max: 1e6,      // cap keeps the penalized subproblem well-conditioned
        }
    }

    /// Sets penalty increase factor.
    ///
    /// # Arguments
    ///
    /// * `factor` - Penalty increase factor (> 1, typically 2.0-10.0)
    ///
    /// # Example
    ///
    /// ```
    /// use aprender::optim::AugmentedLagrangian;
    ///
    /// let optimizer = AugmentedLagrangian::new(100, 1e-6, 1.0)
    ///     .with_rho_increase(5.0);
    /// ```
    #[must_use]
    pub fn with_rho_increase(mut self, factor: f32) -> Self {
        self.rho_increase = factor;
        self
    }

    /// Minimizes objective subject to equality constraints.
    ///
    /// Solves: minimize f(x) subject to h(x) = 0
    ///
    /// Outer loop: approximately minimize the augmented Lagrangian
    /// L_ρ(x, λ) = f(x) + λᵀh(x) + ½ρ‖h(x)‖², then update λ and (if still
    /// infeasible) increase ρ. Inner subproblems are solved with plain
    /// gradient descent (fixed step, at most 50 iterations).
    ///
    /// # Arguments
    ///
    /// * `objective` - Objective function f(x)
    /// * `gradient` - Gradient ∇f(x)
    /// * `equality` - Equality constraints h(x) = 0 (returns vector)
    /// * `equality_jac` - Jacobian of equality constraints ∇h(x)
    /// * `x0` - Initial point
    ///
    /// # Returns
    ///
    /// Optimization result with constraint satisfaction metrics
    pub fn minimize_equality<F, G, H, J>(
        &mut self,
        objective: F,
        gradient: G,
        equality: H,
        equality_jac: J,
        x0: Vector<f32>,
    ) -> OptimizationResult
    where
        F: Fn(&Vector<f32>) -> f32,
        G: Fn(&Vector<f32>) -> Vector<f32>,
        H: Fn(&Vector<f32>) -> Vector<f32>,
        J: Fn(&Vector<f32>) -> Vec<Vector<f32>>,
    {
        let start_time = std::time::Instant::now();

        let mut x = x0;
        // Restart the penalty schedule so the optimizer is reusable across calls.
        self.rho = self.initial_rho;

        // Initialize Lagrange multipliers to zero
        let h0 = equality(&x);
        let m = h0.len(); // Number of equality constraints
        let mut lambda = Vector::zeros(m);

        for outer_iter in 0..self.max_iter {
            // Solve augmented Lagrangian subproblem: min L_ρ(x, λ)
            // L_ρ(x, λ) = f(x) + λᵀh(x) + ½ρ‖h(x)‖²

            // Gradient of the augmented Lagrangian w.r.t. x:
            //   ∇L_ρ = ∇f(x) + ∇h(x)ᵀ(λ + ρ h(x))
            let aug_grad = |x_inner: &Vector<f32>| {
                let grad_f = gradient(x_inner);
                let h_val = equality(x_inner);
                let jac_h = equality_jac(x_inner);

                let n = x_inner.len();
                let mut aug_g = Vector::zeros(n);

                // ∇f(x)
                for i in 0..n {
                    aug_g[i] = grad_f[i];
                }

                // Add ∇h(x)ᵀ(λ + ρh(x))
                for j in 0..m {
                    let coeff = lambda[j] + self.rho * h_val[j];
                    for i in 0..n {
                        aug_g[i] += coeff * jac_h[j][i];
                    }
                }

                aug_g
            };

            // Solve subproblem using gradient descent (simple solver).
            // NOTE(review): fixed step size and a hard 50-iteration budget are
            // adequate for well-scaled problems but may need tuning otherwise.
            let mut x_sub = x.clone();
            let alpha = 0.01; // Fixed step size for subproblem
            for _sub_iter in 0..50 {
                let grad = aug_grad(&x_sub);
                let mut grad_norm_sq = 0.0;
                for i in 0..grad.len() {
                    grad_norm_sq += grad[i] * grad[i];
                }
                // Squared-norm test: 1e-8 here ≈ 1e-4 on the gradient norm.
                if grad_norm_sq < 1e-8 {
                    break; // Subproblem converged
                }
                for i in 0..x_sub.len() {
                    x_sub[i] -= alpha * grad[i];
                }
            }

            x = x_sub;

            // Update Lagrange multipliers: λ_k+1 = λ_k + ρ h(x_k)
            let h_val = equality(&x);
            for i in 0..m {
                lambda[i] += self.rho * h_val[i];
            }

            // Check constraint violation (Euclidean norm of h(x))
            let mut constraint_viol = 0.0;
            for i in 0..m {
                constraint_viol += h_val[i] * h_val[i];
            }
            constraint_viol = constraint_viol.sqrt();

            // Check convergence
            if constraint_viol < self.tol {
                let final_obj = objective(&x);
                let grad_f = gradient(&x);
                let mut grad_norm = 0.0;
                for i in 0..grad_f.len() {
                    grad_norm += grad_f[i] * grad_f[i];
                }
                grad_norm = grad_norm.sqrt();

                return OptimizationResult {
                    solution: x,
                    objective_value: final_obj,
                    iterations: outer_iter + 1,
                    status: ConvergenceStatus::Converged,
                    gradient_norm: grad_norm,
                    constraint_violation: constraint_viol,
                    elapsed_time: start_time.elapsed(),
                };
            }

            // Still infeasible: grow the penalty parameter (up to rho_max) so the
            // next subproblem weights constraint satisfaction more heavily.
            // (The 0.1·tol threshold only skips the increase when violation is
            // already within an order of magnitude of the tolerance.)
            if constraint_viol > 0.1 * self.tol && self.rho < self.rho_max {
                self.rho *= self.rho_increase;
            }
        }

        // Max iterations reached — report final objective, violation, and ∇f norm.
        let final_obj = objective(&x);
        let h_val = equality(&x);
        let mut constraint_viol = 0.0;
        for i in 0..h_val.len() {
            constraint_viol += h_val[i] * h_val[i];
        }
        constraint_viol = constraint_viol.sqrt();

        let grad_f = gradient(&x);
        let mut grad_norm = 0.0;
        for i in 0..grad_f.len() {
            grad_norm += grad_f[i] * grad_f[i];
        }
        grad_norm = grad_norm.sqrt();

        OptimizationResult {
            solution: x,
            objective_value: final_obj,
            iterations: self.max_iter,
            status: ConvergenceStatus::MaxIterations,
            gradient_norm: grad_norm,
            constraint_violation: constraint_viol,
            elapsed_time: start_time.elapsed(),
        }
    }
}
3006
impl Optimizer for AugmentedLagrangian {
    /// Not supported: this is a batch method and always panics here.
    /// Use `minimize_equality()` for constrained optimization.
    fn step(&mut self, _params: &mut Vector<f32>, _gradients: &Vector<f32>) {
        unimplemented!(
            "Augmented Lagrangian does not support stochastic updates (step). Use minimize_equality() for constrained optimization."
        )
    }

    /// Restores the penalty parameter ρ to its initial value.
    /// (Lagrange multipliers are local to `minimize_equality`, so there is
    /// no multiplier state to clear.)
    fn reset(&mut self) {
        // Reset penalty parameter to initial value
        self.rho = self.initial_rho;
    }
}
3019
3020/// Interior Point (Barrier) method for inequality-constrained optimization.
3021///
3022/// Solves problems with inequality constraints:
3023/// ```text
3024/// minimize f(x)
3025/// subject to: g(x) ≤ 0  (inequality constraints)
3026/// ```
3027///
3028/// # Algorithm
3029///
3030/// ```text
3031/// Barrier function: B_μ(x) = f(x) - μ Σ log(-g_i(x))
3032///
3033/// for k = 1, 2, ..., max_iter:
3034///     x_k = argmin B_μ(x)  (barrier subproblem)
3035///     μ_k+1 = β * μ_k      (decrease barrier parameter)
3036///     if ‖∇B_μ(x)‖ is small: converged
3037/// ```
3038///
3039/// # Key Features
3040///
3041/// - **Log-barrier**: Enforces g(x) < 0 via -μ log(-g_i(x))
3042/// - **Path-following**: Decreases μ → 0 to approach constrained optimum
3043/// - **Self-concordant**: Converges in O(√n log(1/ε)) iterations
3044/// - **Warm start**: Uses previous solution for next barrier value
3045///
3046/// # Applications
3047///
3048/// - **Linear programming**: Constraints Ax ≤ b
3049/// - **Quadratic programming**: QP with inequality constraints
3050/// - **Semidefinite programming**: Matrix constraints X ⪰ 0
3051/// - **Support Vector Machines**: Soft-margin constraints
3052/// - **Portfolio optimization**: Long-only constraints (x ≥ 0)
3053///
3054/// # Example
3055///
3056/// ```
3057/// use aprender::optim::InteriorPoint;
3058/// use aprender::primitives::Vector;
3059///
3060/// // Minimize: x₁² + x₂² subject to -x₁ ≤ 0, -x₂ ≤ 0 (i.e., x ≥ 0)
3061/// let objective = |x: &Vector<f32>| x[0] * x[0] + x[1] * x[1];
3062///
3063/// let gradient = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 2.0 * x[1]]);
3064///
3065/// // Inequality constraints: g(x) = [-x₁, -x₂] ≤ 0
3066/// let inequality = |x: &Vector<f32>| Vector::from_slice(&[-x[0], -x[1]]);
3067///
3068/// let inequality_jac = |_x: &Vector<f32>| {
3069///     vec![Vector::from_slice(&[-1.0, 0.0]), Vector::from_slice(&[0.0, -1.0])]
3070/// };
3071///
3072/// let mut ip = InteriorPoint::new(50, 1e-6, 1.0);
3073/// let x0 = Vector::from_slice(&[1.0, 1.0]); // Feasible start
3074/// let result = ip.minimize(objective, gradient, inequality, inequality_jac, x0);
3075///
3076/// // Solution should be [0, 0] (constrained minimum)
3077/// assert!(result.solution[0].abs() < 1e-3);
3078/// assert!(result.solution[1].abs() < 1e-3);
3079/// ```
3080///
3081/// # References
3082///
3083/// - Nesterov & Nemirovskii (1994). "Interior-Point Polynomial Algorithms in Convex Programming."
3084/// - Boyd & Vandenberghe (2004). "Convex Optimization." Chapter 11.
3085/// - Wright (1997). "Primal-Dual Interior-Point Methods."
#[derive(Debug, Clone)]
pub struct InteriorPoint {
    /// Maximum number of outer iterations (barrier parameter updates)
    max_iter: usize,
    /// Convergence tolerance on the barrier-gradient norm
    tol: f32,
    /// Initial barrier parameter; μ is restored to this value by `reset`
    /// and at the start of every `minimize` call
    initial_mu: f32,
    /// Current barrier parameter μ (shrinks toward 0 by factor `beta` each outer iteration)
    mu: f32,
    /// Barrier decrease factor (0 < beta < 1, typically 0.1-0.5; default 0.2 set in `new`)
    beta: f32,
}
3099
impl InteriorPoint {
    /// Creates a new Interior Point optimizer.
    ///
    /// # Arguments
    ///
    /// * `max_iter` - Maximum number of outer iterations
    /// * `tol` - Convergence tolerance
    /// * `initial_mu` - Initial barrier parameter (typically 1.0-10.0)
    ///
    /// # Example
    ///
    /// ```
    /// use aprender::optim::InteriorPoint;
    ///
    /// let optimizer = InteriorPoint::new(50, 1e-6, 1.0);
    /// ```
    #[must_use]
    pub fn new(max_iter: usize, tol: f32, initial_mu: f32) -> Self {
        Self {
            max_iter,
            tol,
            initial_mu,
            mu: initial_mu,
            beta: 0.2, // default: shrink μ by 5x per outer iteration
        }
    }

    /// Sets barrier decrease factor.
    ///
    /// # Arguments
    ///
    /// * `beta` - Barrier decrease factor (0 < beta < 1, typically 0.1-0.5)
    ///
    /// # Example
    ///
    /// ```
    /// use aprender::optim::InteriorPoint;
    ///
    /// let optimizer = InteriorPoint::new(50, 1e-6, 1.0)
    ///     .with_beta(0.1); // Aggressive barrier decrease
    /// ```
    #[must_use]
    pub fn with_beta(mut self, beta: f32) -> Self {
        self.beta = beta;
        self
    }

    /// Minimizes objective subject to inequality constraints.
    ///
    /// Solves: minimize f(x) subject to g(x) ≤ 0
    ///
    /// Outer loop: approximately minimize the log-barrier function
    /// B_μ(x) = f(x) - μ Σ log(-g_i(x)) by gradient descent, then shrink
    /// μ by `beta` and repeat (path-following toward the constrained optimum).
    ///
    /// # Arguments
    ///
    /// * `objective` - Objective function f(x)
    /// * `gradient` - Gradient ∇f(x)
    /// * `inequality` - Inequality constraints g(x) ≤ 0 (returns vector)
    /// * `inequality_jac` - Jacobian of inequality constraints ∇g(x)
    /// * `x0` - Initial feasible point (must satisfy g(x0) < 0 strictly)
    ///
    /// # Returns
    ///
    /// Optimization result with constraint satisfaction metrics
    ///
    /// # Panics
    ///
    /// Panics if initial point is infeasible (g(x0) ≥ 0 for any constraint)
    #[allow(clippy::too_many_lines)]
    pub fn minimize<F, G, H, J>(
        &mut self,
        objective: F,
        gradient: G,
        inequality: H,
        inequality_jac: J,
        x0: Vector<f32>,
    ) -> OptimizationResult
    where
        F: Fn(&Vector<f32>) -> f32,
        G: Fn(&Vector<f32>) -> Vector<f32>,
        H: Fn(&Vector<f32>) -> Vector<f32>,
        J: Fn(&Vector<f32>) -> Vec<Vector<f32>>,
    {
        let start_time = std::time::Instant::now();

        // Check initial feasibility: the log-barrier is only defined on the
        // strict interior g(x) < 0.
        let g0 = inequality(&x0);
        for i in 0..g0.len() {
            assert!(
                g0[i] < 0.0,
                "Initial point is infeasible: g[{}] = {} ≥ 0. Interior point requires strictly feasible start.",
                i, g0[i]
            );
        }

        let mut x = x0;
        // Restart the barrier schedule so the optimizer is reusable across calls.
        self.mu = self.initial_mu;
        let m = g0.len(); // Number of inequality constraints

        for outer_iter in 0..self.max_iter {
            // Solve barrier subproblem: min B_μ(x) = f(x) - μ Σ log(-g_i(x))

            // Gradient of the barrier function:
            //   ∇B_μ = ∇f(x) + Σ_j (μ / (-g_j(x))) ∇g_j(x)
            let barrier_grad = |x_inner: &Vector<f32>| {
                let grad_f = gradient(x_inner);
                let g_val = inequality(x_inner);
                let jac_g = inequality_jac(x_inner);

                let n = x_inner.len();
                let mut barrier_g = Vector::zeros(n);

                // ∇f(x)
                for i in 0..n {
                    barrier_g[i] = grad_f[i];
                }

                // Subtract μ Σ (1/(-g_i)) * ∇g_i(x)
                for j in 0..m {
                    if g_val[j] >= 0.0 {
                        // At or past the boundary the log-barrier is undefined;
                        // skip this constraint's term rather than produce NaN/inf.
                        continue;
                    }
                    let coeff = -self.mu / g_val[j];
                    for i in 0..n {
                        barrier_g[i] += coeff * jac_g[j][i];
                    }
                }

                barrier_g
            };

            // Solve barrier subproblem using gradient descent
            // (fixed step, at most 50 inner iterations).
            let mut x_sub = x.clone();
            let alpha = 0.01; // Fixed step size
            for _sub_iter in 0..50 {
                let grad = barrier_grad(&x_sub);

                // Check if gradient is small (converged)
                let mut grad_norm_sq = 0.0;
                for i in 0..grad.len() {
                    grad_norm_sq += grad[i] * grad[i];
                }
                // Squared-norm test: 1e-8 here ≈ 1e-4 on the gradient norm.
                if grad_norm_sq < 1e-8 {
                    break;
                }

                // Gradient descent step
                for i in 0..x_sub.len() {
                    x_sub[i] -= alpha * grad[i];
                }

                // Check feasibility - if we violated constraints, step back
                let g_sub = inequality(&x_sub);
                let mut infeasible = false;
                for i in 0..m {
                    if g_sub[i] >= -1e-8 {
                        // Close to or past boundary
                        infeasible = true;
                        break;
                    }
                }
                if infeasible {
                    // Heuristic partial undo: retreat half the step. Note this
                    // does not re-verify feasibility of the retreated point.
                    for i in 0..x_sub.len() {
                        x_sub[i] += alpha * grad[i] * 0.5; // Half step back
                    }
                }
            }

            x = x_sub;

            // Check convergence via gradient of barrier function
            let grad_barrier = barrier_grad(&x);
            let mut grad_norm = 0.0;
            for i in 0..grad_barrier.len() {
                grad_norm += grad_barrier[i] * grad_barrier[i];
            }
            grad_norm = grad_norm.sqrt();

            // Also check constraint violation (max positive g_i, 0 if feasible)
            let g_val = inequality(&x);
            let mut max_violation = 0.0;
            for i in 0..m {
                if g_val[i] > max_violation {
                    max_violation = g_val[i];
                }
            }

            // Converged only when the gradient is small AND μ is small: a small
            // μ ensures the barrier optimum approximates the constrained optimum.
            if grad_norm < self.tol && self.mu < 1e-4 {
                let final_obj = objective(&x);
                return OptimizationResult {
                    solution: x,
                    objective_value: final_obj,
                    iterations: outer_iter + 1,
                    status: ConvergenceStatus::Converged,
                    gradient_norm: grad_norm,
                    constraint_violation: max_violation.max(0.0),
                    elapsed_time: start_time.elapsed(),
                };
            }

            // Decrease barrier parameter (path-following step)
            self.mu *= self.beta;
        }

        // Max iterations reached — report final objective, violation, and ∇f norm.
        let final_obj = objective(&x);
        let g_val = inequality(&x);
        let mut max_violation = 0.0;
        for i in 0..g_val.len() {
            if g_val[i] > max_violation {
                max_violation = g_val[i];
            }
        }

        let grad_f = gradient(&x);
        let mut grad_norm = 0.0;
        for i in 0..grad_f.len() {
            grad_norm += grad_f[i] * grad_f[i];
        }
        grad_norm = grad_norm.sqrt();

        OptimizationResult {
            solution: x,
            objective_value: final_obj,
            iterations: self.max_iter,
            status: ConvergenceStatus::MaxIterations,
            gradient_norm: grad_norm,
            constraint_violation: max_violation.max(0.0),
            elapsed_time: start_time.elapsed(),
        }
    }
}
3331
impl Optimizer for InteriorPoint {
    /// Not supported: this is a batch method and always panics here.
    /// Use `minimize()` for constrained optimization.
    fn step(&mut self, _params: &mut Vector<f32>, _gradients: &Vector<f32>) {
        unimplemented!(
            "Interior Point does not support stochastic updates (step). Use minimize() for constrained optimization."
        )
    }

    /// Restores the barrier parameter μ to its initial value.
    fn reset(&mut self) {
        // Reset barrier parameter to initial value
        self.mu = self.initial_mu;
    }
}
3344
3345/// Stochastic Gradient Descent optimizer.
3346///
3347/// SGD updates parameters using the gradient of the loss function:
3348///
3349/// ```text
3350/// θ = θ - η * ∇L(θ)
3351/// ```
3352///
3353/// With momentum:
3354///
3355/// ```text
3356/// v = γ * v + η * ∇L(θ)
3357/// θ = θ - v
3358/// ```
3359///
3360/// where:
3361/// - θ is the parameter vector
3362/// - η is the learning rate
3363/// - γ is the momentum coefficient
3364/// - v is the velocity vector
3365/// - ∇L(θ) is the gradient of the loss
3366///
3367/// # Example
3368///
3369/// ```
3370/// use aprender::optim::SGD;
3371/// use aprender::primitives::Vector;
3372///
3373/// // Create SGD with momentum
3374/// let mut optimizer = SGD::new(0.1).with_momentum(0.9);
3375///
3376/// let mut params = Vector::from_slice(&[0.0, 0.0]);
3377/// let gradients = Vector::from_slice(&[1.0, 2.0]);
3378///
3379/// // First step
3380/// optimizer.step(&mut params, &gradients);
3381///
3382/// // With momentum, velocity builds up over iterations
3383/// optimizer.step(&mut params, &gradients);
3384/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SGD {
    /// Learning rate (step size η)
    learning_rate: f32,
    /// Momentum coefficient γ (0.0 = no momentum; typical values 0.9-0.99)
    momentum: f32,
    /// Velocity buffer for momentum updates; `None` until the first momentum
    /// `step` (or after `reset`), then sized to the parameter vector
    velocity: Option<Vec<f32>>,
}
3394
3395impl SGD {
3396    /// Creates a new SGD optimizer with the given learning rate.
3397    ///
3398    /// # Arguments
3399    ///
3400    /// * `learning_rate` - Step size for parameter updates
3401    ///
3402    /// # Example
3403    ///
3404    /// ```
3405    /// use aprender::optim::SGD;
3406    ///
3407    /// let optimizer = SGD::new(0.01);
3408    /// assert!((optimizer.learning_rate() - 0.01).abs() < 1e-6);
3409    /// ```
3410    #[must_use]
3411    pub fn new(learning_rate: f32) -> Self {
3412        Self {
3413            learning_rate,
3414            momentum: 0.0,
3415            velocity: None,
3416        }
3417    }
3418
3419    /// Sets the momentum coefficient.
3420    ///
3421    /// Momentum helps accelerate SGD in the relevant direction and dampens
3422    /// oscillations. Typical values are 0.9 or 0.99.
3423    ///
3424    /// # Arguments
3425    ///
3426    /// * `momentum` - Momentum coefficient between 0.0 and 1.0
3427    ///
3428    /// # Example
3429    ///
3430    /// ```
3431    /// use aprender::optim::SGD;
3432    ///
3433    /// let optimizer = SGD::new(0.01).with_momentum(0.9);
3434    /// assert!((optimizer.momentum() - 0.9).abs() < 1e-6);
3435    /// ```
3436    #[must_use]
3437    pub fn with_momentum(mut self, momentum: f32) -> Self {
3438        self.momentum = momentum;
3439        self
3440    }
3441
3442    /// Returns the learning rate.
3443    #[must_use]
3444    pub fn learning_rate(&self) -> f32 {
3445        self.learning_rate
3446    }
3447
3448    /// Returns the momentum coefficient.
3449    #[must_use]
3450    pub fn momentum(&self) -> f32 {
3451        self.momentum
3452    }
3453
3454    /// Updates parameters using gradients.
3455    ///
3456    /// If momentum is enabled, maintains velocity vectors for each parameter.
3457    ///
3458    /// # Arguments
3459    ///
3460    /// * `params` - Mutable reference to parameter vector
3461    /// * `gradients` - Gradient vector (same length as params)
3462    ///
3463    /// # Panics
3464    ///
3465    /// Panics if params and gradients have different lengths.
3466    ///
3467    /// # Example
3468    ///
3469    /// ```
3470    /// use aprender::optim::SGD;
3471    /// use aprender::primitives::Vector;
3472    ///
3473    /// let mut optimizer = SGD::new(0.1);
3474    /// let mut params = Vector::from_slice(&[1.0, 2.0]);
3475    /// let gradients = Vector::from_slice(&[0.5, 1.0]);
3476    ///
3477    /// optimizer.step(&mut params, &gradients);
3478    ///
3479    /// // params = [1.0 - 0.1*0.5, 2.0 - 0.1*1.0] = [0.95, 1.9]
3480    /// assert!((params[0] - 0.95).abs() < 1e-6);
3481    /// assert!((params[1] - 1.9).abs() < 1e-6);
3482    /// ```
3483    pub fn step(&mut self, params: &mut Vector<f32>, gradients: &Vector<f32>) {
3484        assert_eq!(
3485            params.len(),
3486            gradients.len(),
3487            "Parameters and gradients must have same length"
3488        );
3489
3490        let n = params.len();
3491
3492        if self.momentum > 0.0 {
3493            // Initialize velocity if needed
3494            if self.velocity.is_none()
3495                || self
3496                    .velocity
3497                    .as_ref()
3498                    .expect("Velocity must be initialized")
3499                    .len()
3500                    != n
3501            {
3502                self.velocity = Some(vec![0.0; n]);
3503            }
3504
3505            let velocity = self
3506                .velocity
3507                .as_mut()
3508                .expect("Velocity was just initialized");
3509
3510            for i in 0..n {
3511                // v = γ * v + η * gradient
3512                velocity[i] = self.momentum * velocity[i] + self.learning_rate * gradients[i];
3513                // θ = θ - v
3514                params[i] -= velocity[i];
3515            }
3516        } else {
3517            // Standard SGD: θ = θ - η * gradient
3518            for i in 0..n {
3519                params[i] -= self.learning_rate * gradients[i];
3520            }
3521        }
3522    }
3523
3524    /// Resets the optimizer state (velocity vectors).
3525    ///
3526    /// Call this when starting training on a new model or after significant
3527    /// changes to the optimization problem.
3528    ///
3529    /// # Example
3530    ///
3531    /// ```
3532    /// use aprender::optim::SGD;
3533    /// use aprender::primitives::Vector;
3534    ///
3535    /// let mut optimizer = SGD::new(0.1).with_momentum(0.9);
3536    /// let mut params = Vector::from_slice(&[1.0]);
3537    /// let gradients = Vector::from_slice(&[1.0]);
3538    ///
3539    /// optimizer.step(&mut params, &gradients);
3540    /// optimizer.reset();
3541    ///
3542    /// // Velocity is now reset to zero
3543    /// ```
3544    pub fn reset(&mut self) {
3545        self.velocity = None;
3546    }
3547
3548    /// Returns whether momentum is enabled.
3549    #[must_use]
3550    pub fn has_momentum(&self) -> bool {
3551        self.momentum > 0.0
3552    }
3553}
3554
3555/// Adam (Adaptive Moment Estimation) optimizer.
3556///
3557/// Adam combines the benefits of `AdaGrad` and `RMSprop` by computing adaptive learning
3558/// rates for each parameter using estimates of first and second moments of gradients.
3559///
3560/// Update rules:
3561///
3562/// ```text
3563/// m_t = β₁ * m_{t-1} + (1 - β₁) * g_t
3564/// v_t = β₂ * v_{t-1} + (1 - β₂) * g_t²
3565/// m̂_t = m_t / (1 - β₁^t)
3566/// v̂_t = v_t / (1 - β₂^t)
3567/// θ_t = θ_{t-1} - α * m̂_t / (√v̂_t + ε)
3568/// ```
3569///
3570/// where:
3571/// - `m_t` is the first moment (mean) estimate
3572/// - `v_t` is the second moment (variance) estimate
3573/// - β₁, β₂ are exponential decay rates (typically 0.9, 0.999)
3574/// - α is the learning rate (step size)
3575/// - ε is a small constant for numerical stability (typically 1e-8)
3576///
3577/// # Example
3578///
3579/// ```
3580/// use aprender::optim::Adam;
3581/// use aprender::primitives::Vector;
3582///
3583/// // Create Adam optimizer with default hyperparameters
3584/// let mut optimizer = Adam::new(0.001);
3585///
3586/// let mut params = Vector::from_slice(&[1.0, 2.0]);
3587/// let gradients = Vector::from_slice(&[0.1, 0.2]);
3588///
3589/// // Update parameters with adaptive learning rates
3590/// optimizer.step(&mut params, &gradients);
3591/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Adam {
    /// Learning rate (step size α)
    learning_rate: f32,
    /// Exponential decay rate for first moment estimates (default: 0.9)
    beta1: f32,
    /// Exponential decay rate for second moment estimates (default: 0.999)
    beta2: f32,
    /// Small constant for numerical stability (default: 1e-8)
    epsilon: f32,
    /// First moment estimates (mean); `None` until populated by `step`
    m: Option<Vec<f32>>,
    /// Second moment estimates (uncentered variance); `None` until populated by `step`
    v: Option<Vec<f32>>,
    /// Number of steps taken (for bias correction)
    t: usize,
}
3609
3610impl Adam {
    /// Creates a new Adam optimizer with the given learning rate and default hyperparameters.
    ///
    /// Default values (the ones recommended in the original Adam paper):
    /// - beta1 = 0.9
    /// - beta2 = 0.999
    /// - epsilon = 1e-8
    ///
    /// Moment buffers start empty and the step counter at zero.
    ///
    /// # Arguments
    ///
    /// * `learning_rate` - Step size (typical values: 0.001, 0.0001)
    ///
    /// # Example
    ///
    /// ```
    /// use aprender::optim::Adam;
    ///
    /// let optimizer = Adam::new(0.001);
    /// assert!((optimizer.learning_rate() - 0.001).abs() < 1e-9);
    /// ```
    #[must_use]
    pub fn new(learning_rate: f32) -> Self {
        Self {
            learning_rate,
            beta1: 0.9,
            beta2: 0.999,
            epsilon: 1e-8,
            m: None, // no first-moment estimate yet
            v: None, // no second-moment estimate yet
            t: 0,    // step counter used for bias correction
        }
    }
3642
3643    /// Sets the beta1 parameter (exponential decay rate for first moment).
3644    ///
3645    /// # Arguments
3646    ///
3647    /// * `beta1` - Value between 0.0 and 1.0 (typical: 0.9)
3648    ///
3649    /// # Example
3650    ///
3651    /// ```
3652    /// use aprender::optim::Adam;
3653    ///
3654    /// let optimizer = Adam::new(0.001).with_beta1(0.95);
3655    /// assert!((optimizer.beta1() - 0.95).abs() < 1e-9);
3656    /// ```
3657    #[must_use]
3658    pub fn with_beta1(mut self, beta1: f32) -> Self {
3659        self.beta1 = beta1;
3660        self
3661    }
3662
3663    /// Sets the beta2 parameter (exponential decay rate for second moment).
3664    ///
3665    /// # Arguments
3666    ///
3667    /// * `beta2` - Value between 0.0 and 1.0 (typical: 0.999)
3668    ///
3669    /// # Example
3670    ///
3671    /// ```
3672    /// use aprender::optim::Adam;
3673    ///
3674    /// let optimizer = Adam::new(0.001).with_beta2(0.9999);
3675    /// assert!((optimizer.beta2() - 0.9999).abs() < 1e-9);
3676    /// ```
3677    #[must_use]
3678    pub fn with_beta2(mut self, beta2: f32) -> Self {
3679        self.beta2 = beta2;
3680        self
3681    }
3682
3683    /// Sets the epsilon parameter (numerical stability constant).
3684    ///
3685    /// # Arguments
3686    ///
3687    /// * `epsilon` - Small positive value (typical: 1e-8)
3688    ///
3689    /// # Example
3690    ///
3691    /// ```
3692    /// use aprender::optim::Adam;
3693    ///
3694    /// let optimizer = Adam::new(0.001).with_epsilon(1e-7);
3695    /// assert!((optimizer.epsilon() - 1e-7).abs() < 1e-15);
3696    /// ```
3697    #[must_use]
3698    pub fn with_epsilon(mut self, epsilon: f32) -> Self {
3699        self.epsilon = epsilon;
3700        self
3701    }
3702
    /// Returns the learning rate (base step size).
    #[must_use]
    pub fn learning_rate(&self) -> f32 {
        self.learning_rate
    }

    /// Returns the beta1 parameter (first-moment decay rate).
    #[must_use]
    pub fn beta1(&self) -> f32 {
        self.beta1
    }

    /// Returns the beta2 parameter (second-moment decay rate).
    #[must_use]
    pub fn beta2(&self) -> f32 {
        self.beta2
    }

    /// Returns the epsilon parameter (numerical stability constant).
    #[must_use]
    pub fn epsilon(&self) -> f32 {
        self.epsilon
    }

    /// Returns the number of update steps taken since the moment estimates
    /// were last initialized or reset.
    #[must_use]
    pub fn steps(&self) -> usize {
        self.t
    }
3732
3733    /// Updates parameters using gradients with adaptive learning rates.
3734    ///
3735    /// # Arguments
3736    ///
3737    /// * `params` - Mutable reference to parameter vector
3738    /// * `gradients` - Gradient vector (same length as params)
3739    ///
3740    /// # Panics
3741    ///
3742    /// Panics if params and gradients have different lengths.
3743    ///
3744    /// # Example
3745    ///
3746    /// ```
3747    /// use aprender::optim::Adam;
3748    /// use aprender::primitives::Vector;
3749    ///
3750    /// let mut optimizer = Adam::new(0.001);
3751    /// let mut params = Vector::from_slice(&[1.0, 2.0]);
3752    /// let gradients = Vector::from_slice(&[0.1, 0.2]);
3753    ///
3754    /// optimizer.step(&mut params, &gradients);
3755    /// ```
3756    pub fn step(&mut self, params: &mut Vector<f32>, gradients: &Vector<f32>) {
3757        assert_eq!(
3758            params.len(),
3759            gradients.len(),
3760            "Parameters and gradients must have same length"
3761        );
3762
3763        let n = params.len();
3764
3765        // Initialize moment estimates if needed
3766        if self.m.is_none()
3767            || self
3768                .m
3769                .as_ref()
3770                .expect("First moment estimate must be initialized")
3771                .len()
3772                != n
3773        {
3774            self.m = Some(vec![0.0; n]);
3775            self.v = Some(vec![0.0; n]);
3776            self.t = 0;
3777        }
3778
3779        self.t += 1;
3780        let t = self.t as f32;
3781
3782        let m = self.m.as_mut().expect("First moment was just initialized");
3783        let v = self.v.as_mut().expect("Second moment was just initialized");
3784
3785        // Compute bias-corrected learning rate
3786        let lr_t =
3787            self.learning_rate * (1.0 - self.beta2.powf(t)).sqrt() / (1.0 - self.beta1.powf(t));
3788
3789        for i in 0..n {
3790            let g = gradients[i];
3791
3792            // Update biased first moment estimate
3793            m[i] = self.beta1 * m[i] + (1.0 - self.beta1) * g;
3794
3795            // Update biased second raw moment estimate
3796            v[i] = self.beta2 * v[i] + (1.0 - self.beta2) * g * g;
3797
3798            // Update parameters
3799            params[i] -= lr_t * m[i] / (v[i].sqrt() + self.epsilon);
3800        }
3801    }
3802
3803    /// Resets the optimizer state (moment estimates and step counter).
3804    ///
3805    /// Call this when starting training on a new model or after significant
3806    /// changes to the optimization problem.
3807    ///
3808    /// # Example
3809    ///
3810    /// ```
3811    /// use aprender::optim::Adam;
3812    /// use aprender::primitives::Vector;
3813    ///
3814    /// let mut optimizer = Adam::new(0.001);
3815    /// let mut params = Vector::from_slice(&[1.0]);
3816    /// let gradients = Vector::from_slice(&[1.0]);
3817    ///
3818    /// optimizer.step(&mut params, &gradients);
3819    /// assert_eq!(optimizer.steps(), 1);
3820    ///
3821    /// optimizer.reset();
3822    /// assert_eq!(optimizer.steps(), 0);
3823    /// ```
3824    pub fn reset(&mut self) {
3825        self.m = None;
3826        self.v = None;
3827        self.t = 0;
3828    }
3829}
3830
3831impl Optimizer for Adam {
3832    fn step(&mut self, params: &mut Vector<f32>, gradients: &Vector<f32>) {
3833        self.step(params, gradients);
3834    }
3835
3836    fn reset(&mut self) {
3837        self.reset();
3838    }
3839}
3840
/// Unified trait for both stochastic and batch optimizers.
///
/// This trait supports two modes of optimization:
///
/// 1. **Stochastic mode** (`step`): For mini-batch training with SGD, Adam, etc.
/// 2. **Batch mode** (`minimize`): For full-dataset optimization with L-BFGS, CG, etc.
///
/// # Misuse Guards
///
/// - L-BFGS should not be used with `step()` (would give poor results with stochastic gradients)
/// - SGD/Adam do not override `minimize()`; the default implementation panics
///   at runtime if called (batch mode is unsupported for them)
///
/// # Example
///
/// ```
/// use aprender::optim::{Optimizer, SGD};
/// use aprender::primitives::Vector;
///
/// // Stochastic mode (mini-batch training)
/// let mut sgd = SGD::new(0.01);
/// let mut params = Vector::from_slice(&[1.0, 2.0]);
/// let grad = Vector::from_slice(&[0.1, 0.2]);
/// sgd.step(&mut params, &grad);
/// ```
pub trait Optimizer {
    /// Stochastic update (mini-batch mode) - for SGD, Adam, `RMSprop`.
    ///
    /// Updates parameters in-place given gradient from current mini-batch.
    /// Used in ML training loops where gradients come from different data batches.
    ///
    /// # Arguments
    ///
    /// * `params` - Parameter vector to update (modified in-place)
    /// * `gradients` - Gradient vector from current mini-batch
    ///
    /// # Example
    ///
    /// ```
    /// use aprender::optim::{Optimizer, SGD};
    /// use aprender::primitives::Vector;
    ///
    /// let mut optimizer = SGD::new(0.1);
    /// let mut params = Vector::from_slice(&[1.0, 2.0]);
    ///
    /// for _epoch in 0..10 {
    ///     let grad = Vector::from_slice(&[0.1, 0.2]); // From mini-batch
    ///     optimizer.step(&mut params, &grad);
    /// }
    /// ```
    fn step(&mut self, params: &mut Vector<f32>, gradients: &Vector<f32>);

    /// Batch optimization (deterministic mode) - for L-BFGS, CG, Damped Newton.
    ///
    /// Minimizes objective function with full dataset access.
    /// Returns complete optimization trajectory and convergence info.
    ///
    /// **Default implementation**: Not all optimizers support batch mode. Stochastic
    /// optimizers (SGD, Adam) inherit this default, which panics at runtime
    /// if invoked.
    ///
    /// # Arguments
    ///
    /// * `objective` - Objective function f: ℝⁿ → ℝ
    /// * `gradient` - Gradient function ∇f: ℝⁿ → ℝⁿ
    /// * `x0` - Initial point
    ///
    /// # Returns
    ///
    /// [`OptimizationResult`] with solution, convergence status, and diagnostics.
    ///
    /// # Example
    ///
    /// ```ignore
    /// use aprender::optim::{Optimizer, LBFGS};
    /// use aprender::primitives::Vector;
    ///
    /// let mut optimizer = LBFGS::new(100, 1e-5, 10);
    ///
    /// let objective = |x: &Vector<f32>| (x[0] - 5.0).powi(2);
    /// let gradient = |x: &Vector<f32>| Vector::from_slice(&[2.0 * (x[0] - 5.0)]);
    ///
    /// let result = optimizer.minimize(objective, gradient, Vector::from_slice(&[0.0]));
    /// assert_eq!(result.status, ConvergenceStatus::Converged);
    /// ```
    fn minimize<F, G>(
        &mut self,
        _objective: F,
        _gradient: G,
        _x0: Vector<f32>,
    ) -> OptimizationResult
    where
        F: Fn(&Vector<f32>) -> f32,
        G: Fn(&Vector<f32>) -> Vector<f32>,
    {
        // Deliberate runtime guard: stochastic optimizers have no meaningful
        // batch-mode implementation, so the default refuses to pretend.
        unimplemented!(
            "{} does not support batch optimization (minimize). Use step() for stochastic updates.",
            std::any::type_name::<Self>()
        )
    }

    /// Resets the optimizer state (momentum, history, etc.).
    ///
    /// Call this when starting training on a new model or after significant
    /// changes to the optimization problem.
    fn reset(&mut self);
}
3947
3948impl Optimizer for SGD {
3949    fn step(&mut self, params: &mut Vector<f32>, gradients: &Vector<f32>) {
3950        self.step(params, gradients);
3951    }
3952
3953    fn reset(&mut self) {
3954        self.reset();
3955    }
3956}
3957
3958#[cfg(test)]
3959#[allow(non_snake_case)] // Allow mathematical matrix notation (A, B, Q, etc.)
3960mod tests {
3961    use super::*;
3962
3963    // ==================== SafeCholesky Tests ====================
3964
    #[test]
    fn test_safe_cholesky_solve_positive_definite() {
        // Well-conditioned positive definite matrix
        let A = Matrix::from_vec(2, 2, vec![4.0, 2.0, 2.0, 3.0]).expect("valid dimensions");
        let b = Vector::from_slice(&[6.0, 5.0]);

        let x = safe_cholesky_solve(&A, &b, 1e-8, 10).expect("should solve");
        assert_eq!(x.len(), 2);

        // Verify solution: Ax should equal b (approximately)
        let Ax = Vector::from_slice(&[
            A.get(0, 0) * x[0] + A.get(0, 1) * x[1],
            A.get(1, 0) * x[0] + A.get(1, 1) * x[1],
        ]);
        assert!((Ax[0] - b[0]).abs() < 1e-5);
        assert!((Ax[1] - b[1]).abs() < 1e-5);
    }

    #[test]
    fn test_safe_cholesky_solve_identity() {
        // Identity matrix - should solve without regularization
        let A = Matrix::eye(3);
        let b = Vector::from_slice(&[1.0, 2.0, 3.0]);

        let x = safe_cholesky_solve(&A, &b, 1e-8, 10).expect("should solve");

        // For identity matrix, x should equal b
        assert!((x[0] - 1.0).abs() < 1e-6);
        assert!((x[1] - 2.0).abs() < 1e-6);
        assert!((x[2] - 3.0).abs() < 1e-6);
    }

    #[test]
    fn test_safe_cholesky_solve_ill_conditioned() {
        // Ill-conditioned (condition number ~1e10) but solvable with regularization
        let A = Matrix::from_vec(2, 2, vec![1.0, 0.0, 0.0, 1e-10]).expect("valid dimensions");
        let b = Vector::from_slice(&[1.0, 1.0]);

        // Should succeed with regularization
        let x = safe_cholesky_solve(&A, &b, 1e-8, 10).expect("should solve with regularization");
        assert_eq!(x.len(), 2);

        // First component should be close to 1.0 (well-conditioned direction)
        assert!((x[0] - 1.0).abs() < 1e-3);
    }

    #[test]
    fn test_safe_cholesky_solve_not_positive_definite() {
        // Matrix with negative eigenvalue - needs regularization
        let A = Matrix::from_vec(2, 2, vec![1.0, 0.0, 0.0, -0.5]).expect("valid dimensions");
        let b = Vector::from_slice(&[1.0, 1.0]);

        // Should solve with enough regularization
        let result = safe_cholesky_solve(&A, &b, 1e-4, 10);

        // May succeed with regularization or fail gracefully; both outcomes
        // are acceptable for an indefinite matrix.
        if let Ok(x) = result {
            assert_eq!(x.len(), 2);
            // Solution exists with regularization
        } else {
            // Also acceptable - matrix is indefinite
        }
    }

    #[test]
    fn test_safe_cholesky_solve_zero_matrix() {
        // Zero matrix - singular on its own
        let A = Matrix::from_vec(2, 2, vec![0.0, 0.0, 0.0, 0.0]).expect("valid dimensions");
        let b = Vector::from_slice(&[1.0, 1.0]);

        // Should eventually succeed when regularization dominates
        let result = safe_cholesky_solve(&A, &b, 1e-4, 10);
        assert!(result.is_ok()); // Regularization makes it λI which is PD
    }

    #[test]
    fn test_safe_cholesky_solve_small_initial_lambda() {
        // Test with very small initial lambda
        let A = Matrix::eye(2);
        let b = Vector::from_slice(&[1.0, 1.0]);

        let x = safe_cholesky_solve(&A, &b, 1e-12, 10).expect("should solve");

        // Should still work for identity matrix
        assert!((x[0] - 1.0).abs() < 1e-6);
        assert!((x[1] - 1.0).abs() < 1e-6);
    }

    #[test]
    fn test_safe_cholesky_solve_max_attempts() {
        // Test that max_attempts is respected
        let A = Matrix::from_vec(2, 2, vec![1.0, 0.0, 0.0, 1.0]).expect("valid dimensions");
        let b = Vector::from_slice(&[1.0, 1.0]);

        // Even with 1 attempt, should work for identity
        let x = safe_cholesky_solve(&A, &b, 1e-8, 1).expect("should solve");
        assert_eq!(x.len(), 2);
    }

    #[test]
    fn test_safe_cholesky_solve_large_system() {
        // Test with larger system: tridiagonal matrix with 2 on the diagonal
        // and 1 on the off-diagonals (symmetric positive definite).
        let n = 5;
        let mut data = vec![0.0; n * n];
        for i in 0..n {
            data[i * n + i] = 2.0; // Diagonal
            if i > 0 {
                data[i * n + (i - 1)] = 1.0; // Sub-diagonal
                data[(i - 1) * n + i] = 1.0; // Super-diagonal
            }
        }
        let A = Matrix::from_vec(n, n, data).expect("valid dimensions");
        let b = Vector::from_slice(&[1.0, 1.0, 1.0, 1.0, 1.0]);

        let x = safe_cholesky_solve(&A, &b, 1e-8, 10).expect("should solve");
        assert_eq!(x.len(), 5);
    }

    #[test]
    fn test_safe_cholesky_solve_symmetric() {
        // Verify it works with symmetric matrix
        let A = Matrix::from_vec(3, 3, vec![2.0, 1.0, 0.0, 1.0, 2.0, 1.0, 0.0, 1.0, 2.0])
            .expect("valid dimensions");
        let b = Vector::from_slice(&[1.0, 2.0, 1.0]);

        let x = safe_cholesky_solve(&A, &b, 1e-8, 10).expect("should solve");
        assert_eq!(x.len(), 3);
    }

    #[test]
    fn test_safe_cholesky_solve_lambda_escalation() {
        // Test that lambda increases when needed
        // This nearly-singular matrix might need several regularization attempts
        let A = Matrix::from_vec(2, 2, vec![1.0, 0.999, 0.999, 1.0]).expect("valid dimensions");
        let b = Vector::from_slice(&[1.0, 1.0]);

        let x = safe_cholesky_solve(&A, &b, 1e-10, 15).expect("should solve");
        assert_eq!(x.len(), 2);

        // Solution should exist
        assert!(x[0].is_finite());
        assert!(x[1].is_finite());
    }
4108
4109    // ==================== SGD Tests ====================
4110
    #[test]
    fn test_sgd_new() {
        let optimizer = SGD::new(0.01);
        assert!((optimizer.learning_rate() - 0.01).abs() < 1e-6);
        assert!((optimizer.momentum() - 0.0).abs() < 1e-6);
        assert!(!optimizer.has_momentum());
    }

    #[test]
    fn test_sgd_with_momentum() {
        let optimizer = SGD::new(0.01).with_momentum(0.9);
        assert!((optimizer.learning_rate() - 0.01).abs() < 1e-6);
        assert!((optimizer.momentum() - 0.9).abs() < 1e-6);
        assert!(optimizer.has_momentum());
    }

    #[test]
    fn test_sgd_step_basic() {
        let mut optimizer = SGD::new(0.1);
        let mut params = Vector::from_slice(&[1.0, 2.0, 3.0]);
        let gradients = Vector::from_slice(&[1.0, 2.0, 3.0]);

        optimizer.step(&mut params, &gradients);

        // params = params - lr * gradients
        assert!((params[0] - 0.9).abs() < 1e-6);
        assert!((params[1] - 1.8).abs() < 1e-6);
        assert!((params[2] - 2.7).abs() < 1e-6);
    }

    #[test]
    fn test_sgd_step_with_momentum() {
        let mut optimizer = SGD::new(0.1).with_momentum(0.9);
        let mut params = Vector::from_slice(&[1.0, 1.0]);
        let gradients = Vector::from_slice(&[1.0, 1.0]);

        // First step: v = 0.9*0 + 0.1*1 = 0.1, params = 1.0 - 0.1 = 0.9
        optimizer.step(&mut params, &gradients);
        assert!((params[0] - 0.9).abs() < 1e-6);

        // Second step: v = 0.9*0.1 + 0.1*1 = 0.19, params = 0.9 - 0.19 = 0.71
        optimizer.step(&mut params, &gradients);
        assert!((params[0] - 0.71).abs() < 1e-6);
    }

    #[test]
    fn test_sgd_momentum_accumulation() {
        let mut optimizer = SGD::new(0.1).with_momentum(0.9);
        let mut params = Vector::from_slice(&[0.0]);
        let gradients = Vector::from_slice(&[1.0]);

        // Velocity should accumulate over iterations with a constant gradient
        let mut prev_step = 0.0;
        for _ in 0..10 {
            let before = params[0];
            optimizer.step(&mut params, &gradients);
            let step = before - params[0];
            // Each step should be larger (velocity builds up)
            assert!(step >= prev_step);
            prev_step = step;
        }
    }

    #[test]
    fn test_sgd_reset() {
        let mut optimizer = SGD::new(0.1).with_momentum(0.9);
        let mut params = Vector::from_slice(&[1.0]);
        let gradients = Vector::from_slice(&[1.0]);

        optimizer.step(&mut params, &gradients);
        optimizer.reset();

        // After reset, velocity should be zero again, so the first post-reset
        // step matches a fresh optimizer's first step.
        let mut params2 = Vector::from_slice(&[1.0]);
        optimizer.step(&mut params2, &gradients);
        assert!((params2[0] - 0.9).abs() < 1e-6);
    }

    #[test]
    fn test_sgd_zero_gradient() {
        let mut optimizer = SGD::new(0.1);
        let mut params = Vector::from_slice(&[1.0, 2.0]);
        let gradients = Vector::from_slice(&[0.0, 0.0]);

        optimizer.step(&mut params, &gradients);

        // No change with zero gradients
        assert!((params[0] - 1.0).abs() < 1e-6);
        assert!((params[1] - 2.0).abs() < 1e-6);
    }

    #[test]
    fn test_sgd_negative_gradients() {
        let mut optimizer = SGD::new(0.1);
        let mut params = Vector::from_slice(&[1.0]);
        let gradients = Vector::from_slice(&[-1.0]);

        optimizer.step(&mut params, &gradients);

        // params = 1.0 - 0.1 * (-1.0) = 1.1 (moves opposite the gradient sign)
        assert!((params[0] - 1.1).abs() < 1e-6);
    }

    #[test]
    #[should_panic(expected = "same length")]
    fn test_sgd_mismatched_lengths() {
        let mut optimizer = SGD::new(0.1);
        let mut params = Vector::from_slice(&[1.0, 2.0]);
        let gradients = Vector::from_slice(&[1.0]);

        optimizer.step(&mut params, &gradients);
    }

    #[test]
    fn test_sgd_large_learning_rate() {
        let mut optimizer = SGD::new(10.0);
        let mut params = Vector::from_slice(&[1.0]);
        let gradients = Vector::from_slice(&[0.1]);

        optimizer.step(&mut params, &gradients);

        // params = 1.0 - 10.0 * 0.1 = 0.0
        assert!((params[0] - 0.0).abs() < 1e-6);
    }

    #[test]
    fn test_sgd_small_learning_rate() {
        let mut optimizer = SGD::new(0.001);
        let mut params = Vector::from_slice(&[1.0]);
        let gradients = Vector::from_slice(&[1.0]);

        optimizer.step(&mut params, &gradients);

        // params = 1.0 - 0.001 * 1.0 = 0.999
        assert!((params[0] - 0.999).abs() < 1e-6);
    }

    #[test]
    fn test_sgd_clone() {
        let optimizer = SGD::new(0.01).with_momentum(0.9);
        let cloned = optimizer.clone();

        // Clone preserves hyperparameters
        assert!((cloned.learning_rate() - optimizer.learning_rate()).abs() < 1e-6);
        assert!((cloned.momentum() - optimizer.momentum()).abs() < 1e-6);
    }

    #[test]
    fn test_sgd_multiple_steps() {
        let mut optimizer = SGD::new(0.1);
        let mut params = Vector::from_slice(&[10.0]);
        let gradients = Vector::from_slice(&[1.0]);

        for _ in 0..10 {
            optimizer.step(&mut params, &gradients);
        }

        // params = 10.0 - 10 * 0.1 * 1.0 = 9.0
        assert!((params[0] - 9.0).abs() < 1e-4);
    }

    #[test]
    fn test_sgd_velocity_reinitialization() {
        let mut optimizer = SGD::new(0.1).with_momentum(0.9);

        // First with 2 params
        let mut params = Vector::from_slice(&[1.0, 2.0]);
        let gradients = Vector::from_slice(&[1.0, 1.0]);
        optimizer.step(&mut params, &gradients);

        // Now with 3 params - velocity should reinitialize
        let mut params3 = Vector::from_slice(&[1.0, 2.0, 3.0]);
        let gradients3 = Vector::from_slice(&[1.0, 1.0, 1.0]);
        optimizer.step(&mut params3, &gradients3);

        // Should work without error; first step after reinit has zero velocity
        assert!((params3[0] - 0.9).abs() < 1e-6);
    }
4288
4289    // ==================== Adam Tests ====================
4290
    #[test]
    fn test_adam_new() {
        // Defaults: beta1=0.9, beta2=0.999, epsilon=1e-8, zero steps
        let optimizer = Adam::new(0.001);
        assert!((optimizer.learning_rate() - 0.001).abs() < 1e-9);
        assert!((optimizer.beta1() - 0.9).abs() < 1e-9);
        assert!((optimizer.beta2() - 0.999).abs() < 1e-9);
        assert!((optimizer.epsilon() - 1e-8).abs() < 1e-15);
        assert_eq!(optimizer.steps(), 0);
    }

    #[test]
    fn test_adam_with_beta1() {
        let optimizer = Adam::new(0.001).with_beta1(0.95);
        assert!((optimizer.beta1() - 0.95).abs() < 1e-9);
    }

    #[test]
    fn test_adam_with_beta2() {
        let optimizer = Adam::new(0.001).with_beta2(0.9999);
        assert!((optimizer.beta2() - 0.9999).abs() < 1e-9);
    }

    #[test]
    fn test_adam_with_epsilon() {
        let optimizer = Adam::new(0.001).with_epsilon(1e-7);
        assert!((optimizer.epsilon() - 1e-7).abs() < 1e-15);
    }

    #[test]
    fn test_adam_step_basic() {
        let mut optimizer = Adam::new(0.001);
        let mut params = Vector::from_slice(&[1.0, 2.0]);
        let gradients = Vector::from_slice(&[0.1, 0.2]);

        optimizer.step(&mut params, &gradients);

        // Adam should update parameters (exact values depend on bias correction)
        assert!(params[0] < 1.0); // Should decrease
        assert!(params[1] < 2.0); // Should decrease
        assert_eq!(optimizer.steps(), 1);
    }

    #[test]
    fn test_adam_multiple_steps() {
        let mut optimizer = Adam::new(0.001);
        let mut params = Vector::from_slice(&[1.0]);
        let gradients = Vector::from_slice(&[1.0]);

        let initial = params[0];
        for _ in 0..5 {
            optimizer.step(&mut params, &gradients);
        }

        // Parameters should decrease over multiple steps
        assert!(params[0] < initial);
        assert_eq!(optimizer.steps(), 5);
    }

    #[test]
    fn test_adam_bias_correction() {
        let mut optimizer = Adam::new(0.01);
        let mut params = Vector::from_slice(&[10.0]);
        let gradients = Vector::from_slice(&[1.0]);

        // First step should have larger effective learning rate due to bias correction
        optimizer.step(&mut params, &gradients);
        let first_step_size = 10.0 - params[0];

        // Fresh optimizer: take two steps; the difference vs. the one-step
        // `params` above is the second step's size.
        let mut optimizer2 = Adam::new(0.01);
        let mut params2 = Vector::from_slice(&[10.0]);
        optimizer2.step(&mut params2, &gradients);
        optimizer2.step(&mut params2, &gradients);
        let second_step_size = params[0] - params2[0];

        // First step should have larger update due to bias correction
        assert!(first_step_size > second_step_size * 0.5);
    }

    #[test]
    fn test_adam_reset() {
        let mut optimizer = Adam::new(0.001);
        let mut params = Vector::from_slice(&[1.0]);
        let gradients = Vector::from_slice(&[1.0]);

        optimizer.step(&mut params, &gradients);
        assert_eq!(optimizer.steps(), 1);

        optimizer.reset();
        assert_eq!(optimizer.steps(), 0);
    }

    #[test]
    fn test_adam_zero_gradient() {
        let mut optimizer = Adam::new(0.001);
        let mut params = Vector::from_slice(&[1.0, 2.0]);
        let gradients = Vector::from_slice(&[0.0, 0.0]);

        optimizer.step(&mut params, &gradients);

        // With zero gradients, params should not change significantly
        assert!((params[0] - 1.0).abs() < 0.01);
        assert!((params[1] - 2.0).abs() < 0.01);
    }

    #[test]
    fn test_adam_negative_gradients() {
        let mut optimizer = Adam::new(0.001);
        let mut params = Vector::from_slice(&[1.0]);
        let gradients = Vector::from_slice(&[-1.0]);

        optimizer.step(&mut params, &gradients);

        // With negative gradient, params should increase
        assert!(params[0] > 1.0);
    }

    #[test]
    #[should_panic(expected = "same length")]
    fn test_adam_mismatched_lengths() {
        let mut optimizer = Adam::new(0.001);
        let mut params = Vector::from_slice(&[1.0, 2.0]);
        let gradients = Vector::from_slice(&[1.0]);

        optimizer.step(&mut params, &gradients);
    }

    #[test]
    fn test_adam_clone() {
        let optimizer = Adam::new(0.001).with_beta1(0.95).with_beta2(0.9999);
        let cloned = optimizer.clone();

        // Clone preserves all hyperparameters
        assert!((cloned.learning_rate() - optimizer.learning_rate()).abs() < 1e-9);
        assert!((cloned.beta1() - optimizer.beta1()).abs() < 1e-9);
        assert!((cloned.beta2() - optimizer.beta2()).abs() < 1e-9);
        assert!((cloned.epsilon() - optimizer.epsilon()).abs() < 1e-15);
    }
4428
4429    #[test]
4430    fn test_adam_adaptive_learning() {
4431        // Test that Adam adapts to gradient magnitudes
4432        let mut optimizer = Adam::new(0.01);
4433        let mut params = Vector::from_slice(&[1.0, 1.0]);
4434
4435        // Large gradient on first param, small on second
4436        let gradients1 = Vector::from_slice(&[10.0, 0.1]);
4437        optimizer.step(&mut params, &gradients1);
4438
4439        let step1_0 = 1.0 - params[0];
4440        let step1_1 = 1.0 - params[1];
4441
4442        // Continue with same gradients
4443        optimizer.step(&mut params, &gradients1);
4444
4445        // Adam should adapt - second param should take relatively larger steps
4446        // because it has more consistent small gradients
4447        assert!(step1_0 > 0.0);
4448        assert!(step1_1 > 0.0);
4449    }
4450
    #[test]
    fn test_adam_vs_sgd_behavior() {
        // Test that Adam and SGD behave differently (not necessarily one better)
        let mut adam = Adam::new(0.001);
        let mut sgd = SGD::new(0.1);

        let mut params_adam = Vector::from_slice(&[5.0]);
        let mut params_sgd = Vector::from_slice(&[5.0]);

        // Gradient pointing towards 0
        for _ in 0..10 {
            let gradients = Vector::from_slice(&[1.0]);
            adam.step(&mut params_adam, &gradients);
            sgd.step(&mut params_sgd, &gradients);
        }

        // Both should decrease but behave differently
        assert!(params_adam[0] < 5.0);
        assert!(params_sgd[0] < 5.0);
        // They should produce different results due to different update mechanisms
        assert!((params_adam[0] - params_sgd[0]).abs() > 0.01);
    }

    #[test]
    fn test_adam_moment_initialization() {
        let mut optimizer = Adam::new(0.001);

        // First with 2 params
        let mut params = Vector::from_slice(&[1.0, 2.0]);
        let gradients = Vector::from_slice(&[0.1, 0.2]);
        optimizer.step(&mut params, &gradients);

        // Now with 3 params - moments should reinitialize (dimension changed)
        let mut params3 = Vector::from_slice(&[1.0, 2.0, 3.0]);
        let gradients3 = Vector::from_slice(&[0.1, 0.2, 0.3]);
        optimizer.step(&mut params3, &gradients3);

        // Should work without error
        assert!(params3[0] < 1.0);
        assert!(params3[1] < 2.0);
        assert!(params3[2] < 3.0);
    }

    #[test]
    fn test_adam_numerical_stability() {
        let mut optimizer = Adam::new(0.001);
        let mut params = Vector::from_slice(&[1.0]);

        // Very large gradients should be handled stably (second moment
        // normalizes the step magnitude)
        let gradients = Vector::from_slice(&[1000.0]);
        optimizer.step(&mut params, &gradients);

        // Should not produce NaN or extreme values
        assert!(!params[0].is_nan());
        assert!(params[0].is_finite());
    }
4507
4508    // ==================== Line Search Tests ====================
4509
    #[test]
    fn test_backtracking_line_search_new() {
        let ls = BacktrackingLineSearch::new(1e-4, 0.5, 50);
        assert!((ls.c1 - 1e-4).abs() < 1e-10);
        assert!((ls.rho - 0.5).abs() < 1e-10);
        assert_eq!(ls.max_iter, 50);
    }

    #[test]
    fn test_backtracking_line_search_default() {
        // Default should match the conventional parameters used in new() above
        let ls = BacktrackingLineSearch::default();
        assert!((ls.c1 - 1e-4).abs() < 1e-10);
        assert!((ls.rho - 0.5).abs() < 1e-10);
        assert_eq!(ls.max_iter, 50);
    }

    #[test]
    fn test_backtracking_line_search_quadratic() {
        // Test on simple quadratic: f(x) = x^2
        let f = |x: &Vector<f32>| x[0] * x[0];
        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0]]);

        let ls = BacktrackingLineSearch::default();
        let x = Vector::from_slice(&[2.0]);
        let d = Vector::from_slice(&[-4.0]); // Negative gradient direction at x=2

        let alpha = ls.search(&f, &grad, &x, &d);

        // Should find positive step size
        assert!(alpha > 0.0);
        assert!(alpha <= 1.0);

        // Verify Armijo (sufficient decrease) condition is satisfied:
        // f(x + alpha*d) <= f(x) + c1 * alpha * (grad(x) . d)
        let x_new_data = x[0] + alpha * d[0];
        let x_new = Vector::from_slice(&[x_new_data]);
        let fx = f(&x);
        let fx_new = f(&x_new);
        let grad_x = grad(&x);
        let dir_deriv = grad_x[0] * d[0];

        assert!(fx_new <= fx + ls.c1 * alpha * dir_deriv);
    }

    #[test]
    fn test_backtracking_line_search_rosenbrock() {
        // Rosenbrock function: f(x,y) = (1-x)^2 + 100(y-x^2)^2
        let f = |v: &Vector<f32>| {
            let x = v[0];
            let y = v[1];
            (1.0 - x).powi(2) + 100.0 * (y - x * x).powi(2)
        };

        let grad = |v: &Vector<f32>| {
            let x = v[0];
            let y = v[1];
            let dx = -2.0 * (1.0 - x) - 400.0 * x * (y - x * x);
            let dy = 200.0 * (y - x * x);
            Vector::from_slice(&[dx, dy])
        };

        let ls = BacktrackingLineSearch::default();
        let x = Vector::from_slice(&[0.0, 0.0]);
        let g = grad(&x);
        let d = Vector::from_slice(&[-g[0], -g[1]]); // Descent direction

        let alpha = ls.search(&f, &grad, &x, &d);

        assert!(alpha > 0.0);
    }

    #[test]
    fn test_backtracking_line_search_multidimensional() {
        // f(x) = ||x||^2
        let f = |x: &Vector<f32>| {
            let mut sum = 0.0;
            for i in 0..x.len() {
                sum += x[i] * x[i];
            }
            sum
        };

        let grad = |x: &Vector<f32>| {
            let mut g = Vector::zeros(x.len());
            for i in 0..x.len() {
                g[i] = 2.0 * x[i];
            }
            g
        };

        let ls = BacktrackingLineSearch::default();
        let x = Vector::from_slice(&[1.0, 2.0, 3.0]);
        let g = grad(&x);
        let d = Vector::from_slice(&[-g[0], -g[1], -g[2]]);

        let alpha = ls.search(&f, &grad, &x, &d);

        assert!(alpha > 0.0);
        assert!(alpha <= 1.0);
    }

    #[test]
    fn test_wolfe_line_search_new() {
        let ls = WolfeLineSearch::new(1e-4, 0.9, 50);
        assert!((ls.c1 - 1e-4).abs() < 1e-10);
        assert!((ls.c2 - 0.9).abs() < 1e-10);
        assert_eq!(ls.max_iter, 50);
    }

    #[test]
    #[should_panic(expected = "Wolfe conditions require 0 < c1 < c2 < 1")]
    fn test_wolfe_line_search_invalid_c1_c2() {
        // c1 >= c2 should panic
        let _ = WolfeLineSearch::new(0.9, 0.5, 50);
    }
4624
4625    #[test]
4626    #[should_panic(expected = "Wolfe conditions require 0 < c1 < c2 < 1")]
4627    fn test_wolfe_line_search_c1_negative() {
4628        let _ = WolfeLineSearch::new(-0.1, 0.9, 50);
4629    }
4630
4631    #[test]
4632    #[should_panic(expected = "Wolfe conditions require 0 < c1 < c2 < 1")]
4633    fn test_wolfe_line_search_c2_too_large() {
4634        let _ = WolfeLineSearch::new(0.1, 1.5, 50);
4635    }
4636
4637    #[test]
4638    fn test_wolfe_line_search_default() {
4639        let ls = WolfeLineSearch::default();
4640        assert!((ls.c1 - 1e-4).abs() < 1e-10);
4641        assert!((ls.c2 - 0.9).abs() < 1e-10);
4642        assert_eq!(ls.max_iter, 50);
4643    }
4644
4645    #[test]
4646    fn test_wolfe_line_search_quadratic() {
4647        // Test on simple quadratic: f(x) = x^2
4648        let f = |x: &Vector<f32>| x[0] * x[0];
4649        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0]]);
4650
4651        let ls = WolfeLineSearch::default();
4652        let x = Vector::from_slice(&[2.0]);
4653        let d = Vector::from_slice(&[-4.0]); // Gradient direction at x=2
4654
4655        let alpha = ls.search(&f, &grad, &x, &d);
4656
4657        // Should find positive step size
4658        assert!(alpha > 0.0);
4659
4660        // Verify both Wolfe conditions
4661        let x_new_data = x[0] + alpha * d[0];
4662        let x_new = Vector::from_slice(&[x_new_data]);
4663        let fx = f(&x);
4664        let fx_new = f(&x_new);
4665        let grad_x = grad(&x);
4666        let grad_new = grad(&x_new);
4667        let dir_deriv = grad_x[0] * d[0];
4668        let dir_deriv_new = grad_new[0] * d[0];
4669
4670        // Armijo condition
4671        assert!(fx_new <= fx + ls.c1 * alpha * dir_deriv + 1e-6);
4672
4673        // Curvature condition
4674        assert!(dir_deriv_new.abs() <= ls.c2 * dir_deriv.abs() + 1e-6);
4675    }
4676
4677    #[test]
4678    fn test_wolfe_line_search_multidimensional() {
4679        // f(x) = ||x||^2
4680        let f = |x: &Vector<f32>| {
4681            let mut sum = 0.0;
4682            for i in 0..x.len() {
4683                sum += x[i] * x[i];
4684            }
4685            sum
4686        };
4687
4688        let grad = |x: &Vector<f32>| {
4689            let mut g = Vector::zeros(x.len());
4690            for i in 0..x.len() {
4691                g[i] = 2.0 * x[i];
4692            }
4693            g
4694        };
4695
4696        let ls = WolfeLineSearch::default();
4697        let x = Vector::from_slice(&[1.0, 2.0, 3.0]);
4698        let g = grad(&x);
4699        let d = Vector::from_slice(&[-g[0], -g[1], -g[2]]);
4700
4701        let alpha = ls.search(&f, &grad, &x, &d);
4702
4703        assert!(alpha > 0.0);
4704    }
4705
4706    #[test]
4707    fn test_backtracking_vs_wolfe() {
4708        // Compare backtracking and Wolfe on same problem
4709        let f = |x: &Vector<f32>| x[0] * x[0] + x[1] * x[1];
4710        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 2.0 * x[1]]);
4711
4712        let bt = BacktrackingLineSearch::default();
4713        let wolfe = WolfeLineSearch::default();
4714
4715        let x = Vector::from_slice(&[1.0, 1.0]);
4716        let g = grad(&x);
4717        let d = Vector::from_slice(&[-g[0], -g[1]]);
4718
4719        let alpha_bt = bt.search(&f, &grad, &x, &d);
4720        let alpha_wolfe = wolfe.search(&f, &grad, &x, &d);
4721
4722        // Both should find valid step sizes
4723        assert!(alpha_bt > 0.0);
4724        assert!(alpha_wolfe > 0.0);
4725
4726        // Wolfe often finds larger steps due to curvature condition
4727        // but not always, so just verify both are reasonable
4728        assert!(alpha_bt <= 1.0);
4729    }
4730
4731    // ==================== OptimizationResult Tests ====================
4732
4733    #[test]
4734    fn test_optimization_result_converged() {
4735        let solution = Vector::from_slice(&[1.0, 2.0, 3.0]);
4736        let result = OptimizationResult::converged(solution.clone(), 42);
4737
4738        assert_eq!(result.solution.len(), 3);
4739        assert_eq!(result.iterations, 42);
4740        assert_eq!(result.status, ConvergenceStatus::Converged);
4741        assert!((result.objective_value - 0.0).abs() < 1e-10);
4742        assert!((result.gradient_norm - 0.0).abs() < 1e-10);
4743    }
4744
4745    #[test]
4746    fn test_optimization_result_max_iterations() {
4747        let solution = Vector::from_slice(&[5.0]);
4748        let result = OptimizationResult::max_iterations(solution);
4749
4750        assert_eq!(result.iterations, 0);
4751        assert_eq!(result.status, ConvergenceStatus::MaxIterations);
4752    }
4753
4754    #[test]
4755    fn test_convergence_status_equality() {
4756        assert_eq!(ConvergenceStatus::Converged, ConvergenceStatus::Converged);
4757        assert_ne!(
4758            ConvergenceStatus::Converged,
4759            ConvergenceStatus::MaxIterations
4760        );
4761        assert_ne!(
4762            ConvergenceStatus::Stalled,
4763            ConvergenceStatus::NumericalError
4764        );
4765    }
4766
4767    // ==================== L-BFGS Tests ====================
4768
4769    #[test]
4770    fn test_lbfgs_new() {
4771        let optimizer = LBFGS::new(100, 1e-5, 10);
4772        assert_eq!(optimizer.max_iter, 100);
4773        assert!((optimizer.tol - 1e-5).abs() < 1e-10);
4774        assert_eq!(optimizer.m, 10);
4775        assert_eq!(optimizer.s_history.len(), 0);
4776        assert_eq!(optimizer.y_history.len(), 0);
4777    }
4778
4779    #[test]
4780    fn test_lbfgs_simple_quadratic() {
4781        // Minimize f(x) = x^2, optimal at x = 0
4782        let f = |x: &Vector<f32>| x[0] * x[0];
4783        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0]]);
4784
4785        let mut optimizer = LBFGS::new(100, 1e-5, 5);
4786        let x0 = Vector::from_slice(&[5.0]);
4787        let result = optimizer.minimize(f, grad, x0);
4788
4789        assert_eq!(result.status, ConvergenceStatus::Converged);
4790        assert!(result.solution[0].abs() < 1e-3);
4791        assert!(result.iterations < 100);
4792        assert!(result.gradient_norm < 1e-5);
4793    }
4794
4795    #[test]
4796    fn test_lbfgs_multidimensional_quadratic() {
4797        // Minimize f(x) = ||x - c||^2 where c = [1, 2, 3]
4798        let c = [1.0, 2.0, 3.0];
4799        let f = |x: &Vector<f32>| {
4800            let mut sum = 0.0;
4801            for i in 0..x.len() {
4802                sum += (x[i] - c[i]).powi(2);
4803            }
4804            sum
4805        };
4806
4807        let grad = |x: &Vector<f32>| {
4808            let mut g = Vector::zeros(x.len());
4809            for i in 0..x.len() {
4810                g[i] = 2.0 * (x[i] - c[i]);
4811            }
4812            g
4813        };
4814
4815        let mut optimizer = LBFGS::new(100, 1e-5, 10);
4816        let x0 = Vector::from_slice(&[0.0, 0.0, 0.0]);
4817        let result = optimizer.minimize(f, grad, x0);
4818
4819        assert_eq!(result.status, ConvergenceStatus::Converged);
4820        for (i, &target) in c.iter().enumerate().take(3) {
4821            assert!((result.solution[i] - target).abs() < 1e-3);
4822        }
4823    }
4824
4825    #[test]
4826    fn test_lbfgs_rosenbrock() {
4827        // Rosenbrock function: f(x,y) = (1-x)^2 + 100(y-x^2)^2
4828        // Global minimum at (1, 1)
4829        let f = |v: &Vector<f32>| {
4830            let x = v[0];
4831            let y = v[1];
4832            (1.0 - x).powi(2) + 100.0 * (y - x * x).powi(2)
4833        };
4834
4835        let grad = |v: &Vector<f32>| {
4836            let x = v[0];
4837            let y = v[1];
4838            let dx = -2.0 * (1.0 - x) - 400.0 * x * (y - x * x);
4839            let dy = 200.0 * (y - x * x);
4840            Vector::from_slice(&[dx, dy])
4841        };
4842
4843        let mut optimizer = LBFGS::new(200, 1e-4, 10);
4844        let x0 = Vector::from_slice(&[0.0, 0.0]);
4845        let result = optimizer.minimize(f, grad, x0);
4846
4847        // Should converge close to (1, 1)
4848        assert!(
4849            result.status == ConvergenceStatus::Converged
4850                || result.status == ConvergenceStatus::MaxIterations
4851        );
4852        assert!((result.solution[0] - 1.0).abs() < 0.1);
4853        assert!((result.solution[1] - 1.0).abs() < 0.1);
4854    }
4855
4856    #[test]
4857    fn test_lbfgs_sphere() {
4858        // Sphere function: f(x) = sum(x_i^2)
4859        let f = |x: &Vector<f32>| {
4860            let mut sum = 0.0;
4861            for i in 0..x.len() {
4862                sum += x[i] * x[i];
4863            }
4864            sum
4865        };
4866
4867        let grad = |x: &Vector<f32>| {
4868            let mut g = Vector::zeros(x.len());
4869            for i in 0..x.len() {
4870                g[i] = 2.0 * x[i];
4871            }
4872            g
4873        };
4874
4875        let mut optimizer = LBFGS::new(100, 1e-5, 10);
4876        let x0 = Vector::from_slice(&[5.0, -3.0, 2.0, -1.0]);
4877        let result = optimizer.minimize(f, grad, x0);
4878
4879        assert_eq!(result.status, ConvergenceStatus::Converged);
4880        for i in 0..4 {
4881            assert!(result.solution[i].abs() < 1e-3);
4882        }
4883    }
4884
4885    #[test]
4886    fn test_lbfgs_different_history_sizes() {
4887        let f = |x: &Vector<f32>| x[0] * x[0] + x[1] * x[1];
4888        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 2.0 * x[1]]);
4889        let x0 = Vector::from_slice(&[3.0, 4.0]);
4890
4891        // Small history
4892        let mut opt_small = LBFGS::new(100, 1e-5, 3);
4893        let result_small = opt_small.minimize(f, grad, x0.clone());
4894        assert_eq!(result_small.status, ConvergenceStatus::Converged);
4895
4896        // Large history
4897        let mut opt_large = LBFGS::new(100, 1e-5, 20);
4898        let result_large = opt_large.minimize(f, grad, x0);
4899        assert_eq!(result_large.status, ConvergenceStatus::Converged);
4900
4901        // Both should converge to same solution
4902        assert!((result_small.solution[0] - result_large.solution[0]).abs() < 1e-3);
4903        assert!((result_small.solution[1] - result_large.solution[1]).abs() < 1e-3);
4904    }
4905
4906    #[test]
4907    fn test_lbfgs_reset() {
4908        let f = |x: &Vector<f32>| x[0] * x[0];
4909        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0]]);
4910
4911        let mut optimizer = LBFGS::new(100, 1e-5, 10);
4912        let x0 = Vector::from_slice(&[5.0]);
4913
4914        // First run
4915        optimizer.minimize(f, grad, x0.clone());
4916        assert!(!optimizer.s_history.is_empty());
4917
4918        // Reset
4919        optimizer.reset();
4920        assert_eq!(optimizer.s_history.len(), 0);
4921        assert_eq!(optimizer.y_history.len(), 0);
4922
4923        // Second run should work
4924        let result = optimizer.minimize(f, grad, x0);
4925        assert_eq!(result.status, ConvergenceStatus::Converged);
4926    }
4927
4928    #[test]
4929    fn test_lbfgs_max_iterations() {
4930        // Use Rosenbrock with very few iterations to force max_iter
4931        let f = |v: &Vector<f32>| {
4932            let x = v[0];
4933            let y = v[1];
4934            (1.0 - x).powi(2) + 100.0 * (y - x * x).powi(2)
4935        };
4936
4937        let grad = |v: &Vector<f32>| {
4938            let x = v[0];
4939            let y = v[1];
4940            let dx = -2.0 * (1.0 - x) - 400.0 * x * (y - x * x);
4941            let dy = 200.0 * (y - x * x);
4942            Vector::from_slice(&[dx, dy])
4943        };
4944
4945        let mut optimizer = LBFGS::new(3, 1e-10, 5);
4946        let x0 = Vector::from_slice(&[0.0, 0.0]);
4947        let result = optimizer.minimize(f, grad, x0);
4948
4949        // With only 3 iterations, should hit max_iter on Rosenbrock
4950        assert_eq!(result.status, ConvergenceStatus::MaxIterations);
4951        assert_eq!(result.iterations, 3);
4952    }
4953
4954    #[test]
4955    #[should_panic(expected = "does not support stochastic")]
4956    fn test_lbfgs_step_panics() {
4957        let mut optimizer = LBFGS::new(100, 1e-5, 10);
4958        let mut params = Vector::from_slice(&[1.0]);
4959        let grad = Vector::from_slice(&[0.1]);
4960
4961        // Should panic - L-BFGS doesn't support step()
4962        optimizer.step(&mut params, &grad);
4963    }
4964
4965    #[test]
4966    fn test_lbfgs_numerical_error_detection() {
4967        // Function that produces NaN
4968        let f = |x: &Vector<f32>| {
4969            if x[0] < -100.0 {
4970                f32::NAN
4971            } else {
4972                x[0] * x[0]
4973            }
4974        };
4975        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0]]);
4976
4977        let mut optimizer = LBFGS::new(100, 1e-5, 5);
4978        let x0 = Vector::from_slice(&[0.0]);
4979        let result = optimizer.minimize(f, grad, x0);
4980
4981        // Should detect numerical error or converge normally
4982        assert!(
4983            result.status == ConvergenceStatus::Converged
4984                || result.status == ConvergenceStatus::NumericalError
4985                || result.status == ConvergenceStatus::MaxIterations
4986        );
4987    }
4988
4989    #[test]
4990    fn test_lbfgs_computes_elapsed_time() {
4991        let f = |x: &Vector<f32>| x[0] * x[0];
4992        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0]]);
4993
4994        let mut optimizer = LBFGS::new(100, 1e-5, 5);
4995        let x0 = Vector::from_slice(&[5.0]);
4996        let result = optimizer.minimize(f, grad, x0);
4997
4998        // Should have non-zero elapsed time
4999        assert!(result.elapsed_time.as_nanos() > 0);
5000    }
5001
5002    #[test]
5003    fn test_lbfgs_gradient_norm_tracking() {
5004        let f = |x: &Vector<f32>| x[0] * x[0] + x[1] * x[1];
5005        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 2.0 * x[1]]);
5006
5007        let mut optimizer = LBFGS::new(100, 1e-5, 10);
5008        let x0 = Vector::from_slice(&[3.0, 4.0]);
5009        let result = optimizer.minimize(f, grad, x0);
5010
5011        // Gradient norm at convergence should be small
5012        if result.status == ConvergenceStatus::Converged {
5013            assert!(result.gradient_norm < 1e-5);
5014        }
5015    }
5016
5017    // ==================== Conjugate Gradient Tests ====================
5018
5019    #[test]
5020    fn test_cg_new_fletcher_reeves() {
5021        let optimizer = ConjugateGradient::new(100, 1e-5, CGBetaFormula::FletcherReeves);
5022        assert_eq!(optimizer.max_iter, 100);
5023        assert!((optimizer.tol - 1e-5).abs() < 1e-10);
5024        assert_eq!(optimizer.beta_formula, CGBetaFormula::FletcherReeves);
5025        assert_eq!(optimizer.restart_interval, 0);
5026    }
5027
5028    #[test]
5029    fn test_cg_new_polak_ribiere() {
5030        let optimizer = ConjugateGradient::new(100, 1e-5, CGBetaFormula::PolakRibiere);
5031        assert_eq!(optimizer.beta_formula, CGBetaFormula::PolakRibiere);
5032    }
5033
5034    #[test]
5035    fn test_cg_new_hestenes_stiefel() {
5036        let optimizer = ConjugateGradient::new(100, 1e-5, CGBetaFormula::HestenesStiefel);
5037        assert_eq!(optimizer.beta_formula, CGBetaFormula::HestenesStiefel);
5038    }
5039
5040    #[test]
5041    fn test_cg_with_restart_interval() {
5042        let optimizer = ConjugateGradient::new(100, 1e-5, CGBetaFormula::PolakRibiere)
5043            .with_restart_interval(50);
5044        assert_eq!(optimizer.restart_interval, 50);
5045    }
5046
5047    #[test]
5048    fn test_cg_simple_quadratic_fr() {
5049        // Minimize f(x) = x^2, optimal at x = 0
5050        let f = |x: &Vector<f32>| x[0] * x[0];
5051        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0]]);
5052
5053        let mut optimizer = ConjugateGradient::new(100, 1e-5, CGBetaFormula::FletcherReeves);
5054        let x0 = Vector::from_slice(&[5.0]);
5055        let result = optimizer.minimize(f, grad, x0);
5056
5057        assert_eq!(result.status, ConvergenceStatus::Converged);
5058        assert!(result.solution[0].abs() < 1e-3);
5059    }
5060
5061    #[test]
5062    fn test_cg_simple_quadratic_pr() {
5063        let f = |x: &Vector<f32>| x[0] * x[0];
5064        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0]]);
5065
5066        let mut optimizer = ConjugateGradient::new(100, 1e-5, CGBetaFormula::PolakRibiere);
5067        let x0 = Vector::from_slice(&[5.0]);
5068        let result = optimizer.minimize(f, grad, x0);
5069
5070        assert_eq!(result.status, ConvergenceStatus::Converged);
5071        assert!(result.solution[0].abs() < 1e-3);
5072    }
5073
5074    #[test]
5075    fn test_cg_simple_quadratic_hs() {
5076        let f = |x: &Vector<f32>| x[0] * x[0];
5077        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0]]);
5078
5079        let mut optimizer = ConjugateGradient::new(100, 1e-5, CGBetaFormula::HestenesStiefel);
5080        let x0 = Vector::from_slice(&[5.0]);
5081        let result = optimizer.minimize(f, grad, x0);
5082
5083        assert_eq!(result.status, ConvergenceStatus::Converged);
5084        assert!(result.solution[0].abs() < 1e-3);
5085    }
5086
5087    #[test]
5088    fn test_cg_multidimensional_quadratic() {
5089        // Minimize f(x) = ||x - c||^2 where c = [1, 2, 3]
5090        let c = [1.0, 2.0, 3.0];
5091        let f = |x: &Vector<f32>| {
5092            let mut sum = 0.0;
5093            for i in 0..x.len() {
5094                sum += (x[i] - c[i]).powi(2);
5095            }
5096            sum
5097        };
5098
5099        let grad = |x: &Vector<f32>| {
5100            let mut g = Vector::zeros(x.len());
5101            for i in 0..x.len() {
5102                g[i] = 2.0 * (x[i] - c[i]);
5103            }
5104            g
5105        };
5106
5107        let mut optimizer = ConjugateGradient::new(100, 1e-5, CGBetaFormula::PolakRibiere);
5108        let x0 = Vector::from_slice(&[0.0, 0.0, 0.0]);
5109        let result = optimizer.minimize(f, grad, x0);
5110
5111        assert_eq!(result.status, ConvergenceStatus::Converged);
5112        for (i, &target) in c.iter().enumerate() {
5113            assert!((result.solution[i] - target).abs() < 1e-3);
5114        }
5115    }
5116
5117    #[test]
5118    fn test_cg_rosenbrock() {
5119        // Rosenbrock function: f(x,y) = (1-x)^2 + 100(y-x^2)^2
5120        let f = |v: &Vector<f32>| {
5121            let x = v[0];
5122            let y = v[1];
5123            (1.0 - x).powi(2) + 100.0 * (y - x * x).powi(2)
5124        };
5125
5126        let grad = |v: &Vector<f32>| {
5127            let x = v[0];
5128            let y = v[1];
5129            let dx = -2.0 * (1.0 - x) - 400.0 * x * (y - x * x);
5130            let dy = 200.0 * (y - x * x);
5131            Vector::from_slice(&[dx, dy])
5132        };
5133
5134        let mut optimizer = ConjugateGradient::new(500, 1e-4, CGBetaFormula::PolakRibiere);
5135        let x0 = Vector::from_slice(&[0.0, 0.0]);
5136        let result = optimizer.minimize(f, grad, x0);
5137
5138        // Should converge close to (1, 1)
5139        assert!(
5140            result.status == ConvergenceStatus::Converged
5141                || result.status == ConvergenceStatus::MaxIterations
5142        );
5143        assert!((result.solution[0] - 1.0).abs() < 0.1);
5144        assert!((result.solution[1] - 1.0).abs() < 0.1);
5145    }
5146
5147    #[test]
5148    fn test_cg_sphere() {
5149        // Sphere function: f(x) = sum(x_i^2)
5150        let f = |x: &Vector<f32>| {
5151            let mut sum = 0.0;
5152            for i in 0..x.len() {
5153                sum += x[i] * x[i];
5154            }
5155            sum
5156        };
5157
5158        let grad = |x: &Vector<f32>| {
5159            let mut g = Vector::zeros(x.len());
5160            for i in 0..x.len() {
5161                g[i] = 2.0 * x[i];
5162            }
5163            g
5164        };
5165
5166        let mut optimizer = ConjugateGradient::new(100, 1e-5, CGBetaFormula::PolakRibiere);
5167        let x0 = Vector::from_slice(&[5.0, -3.0, 2.0, -1.0]);
5168        let result = optimizer.minimize(f, grad, x0);
5169
5170        assert_eq!(result.status, ConvergenceStatus::Converged);
5171        for i in 0..4 {
5172            assert!(result.solution[i].abs() < 1e-3);
5173        }
5174    }
5175
5176    #[test]
5177    fn test_cg_compare_beta_formulas() {
5178        // Compare different beta formulas on same problem
5179        let f = |x: &Vector<f32>| x[0] * x[0] + x[1] * x[1];
5180        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 2.0 * x[1]]);
5181        let x0 = Vector::from_slice(&[3.0, 4.0]);
5182
5183        let mut opt_fr = ConjugateGradient::new(100, 1e-5, CGBetaFormula::FletcherReeves);
5184        let result_fr = opt_fr.minimize(f, grad, x0.clone());
5185
5186        let mut opt_pr = ConjugateGradient::new(100, 1e-5, CGBetaFormula::PolakRibiere);
5187        let result_pr = opt_pr.minimize(f, grad, x0.clone());
5188
5189        let mut opt_hs = ConjugateGradient::new(100, 1e-5, CGBetaFormula::HestenesStiefel);
5190        let result_hs = opt_hs.minimize(f, grad, x0);
5191
5192        // All should converge to same solution
5193        assert_eq!(result_fr.status, ConvergenceStatus::Converged);
5194        assert_eq!(result_pr.status, ConvergenceStatus::Converged);
5195        assert_eq!(result_hs.status, ConvergenceStatus::Converged);
5196
5197        assert!(result_fr.solution[0].abs() < 1e-3);
5198        assert!(result_pr.solution[0].abs() < 1e-3);
5199        assert!(result_hs.solution[0].abs() < 1e-3);
5200    }
5201
5202    #[test]
5203    fn test_cg_with_periodic_restart() {
5204        let f = |x: &Vector<f32>| x[0] * x[0] + x[1] * x[1];
5205        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 2.0 * x[1]]);
5206
5207        let mut optimizer =
5208            ConjugateGradient::new(100, 1e-5, CGBetaFormula::PolakRibiere).with_restart_interval(5);
5209        let x0 = Vector::from_slice(&[5.0, 5.0]);
5210        let result = optimizer.minimize(f, grad, x0);
5211
5212        assert_eq!(result.status, ConvergenceStatus::Converged);
5213        assert!(result.solution[0].abs() < 1e-3);
5214        assert!(result.solution[1].abs() < 1e-3);
5215    }
5216
5217    #[test]
5218    fn test_cg_reset() {
5219        let f = |x: &Vector<f32>| x[0] * x[0];
5220        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0]]);
5221
5222        let mut optimizer = ConjugateGradient::new(100, 1e-5, CGBetaFormula::PolakRibiere);
5223        let x0 = Vector::from_slice(&[5.0]);
5224
5225        // First run
5226        optimizer.minimize(f, grad, x0.clone());
5227        assert!(optimizer.prev_direction.is_some());
5228
5229        // Reset
5230        optimizer.reset();
5231        assert!(optimizer.prev_direction.is_none());
5232        assert!(optimizer.prev_gradient.is_none());
5233        assert_eq!(optimizer.iter_count, 0);
5234
5235        // Second run should work
5236        let result = optimizer.minimize(f, grad, x0);
5237        assert_eq!(result.status, ConvergenceStatus::Converged);
5238    }
5239
5240    #[test]
5241    fn test_cg_max_iterations() {
5242        // Use Rosenbrock with very few iterations
5243        let f = |v: &Vector<f32>| {
5244            let x = v[0];
5245            let y = v[1];
5246            (1.0 - x).powi(2) + 100.0 * (y - x * x).powi(2)
5247        };
5248
5249        let grad = |v: &Vector<f32>| {
5250            let x = v[0];
5251            let y = v[1];
5252            let dx = -2.0 * (1.0 - x) - 400.0 * x * (y - x * x);
5253            let dy = 200.0 * (y - x * x);
5254            Vector::from_slice(&[dx, dy])
5255        };
5256
5257        let mut optimizer = ConjugateGradient::new(3, 1e-10, CGBetaFormula::PolakRibiere);
5258        let x0 = Vector::from_slice(&[0.0, 0.0]);
5259        let result = optimizer.minimize(f, grad, x0);
5260
5261        assert_eq!(result.status, ConvergenceStatus::MaxIterations);
5262        assert_eq!(result.iterations, 3);
5263    }
5264
5265    #[test]
5266    #[should_panic(expected = "does not support stochastic")]
5267    fn test_cg_step_panics() {
5268        let mut optimizer = ConjugateGradient::new(100, 1e-5, CGBetaFormula::PolakRibiere);
5269        let mut params = Vector::from_slice(&[1.0]);
5270        let grad = Vector::from_slice(&[0.1]);
5271
5272        // Should panic - CG doesn't support step()
5273        optimizer.step(&mut params, &grad);
5274    }
5275
5276    #[test]
5277    fn test_cg_numerical_error_detection() {
5278        // Function that produces NaN
5279        let f = |x: &Vector<f32>| {
5280            if x[0] < -100.0 {
5281                f32::NAN
5282            } else {
5283                x[0] * x[0]
5284            }
5285        };
5286        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0]]);
5287
5288        let mut optimizer = ConjugateGradient::new(100, 1e-5, CGBetaFormula::PolakRibiere);
5289        let x0 = Vector::from_slice(&[0.0]);
5290        let result = optimizer.minimize(f, grad, x0);
5291
5292        // Should detect numerical error or converge normally
5293        assert!(
5294            result.status == ConvergenceStatus::Converged
5295                || result.status == ConvergenceStatus::NumericalError
5296                || result.status == ConvergenceStatus::MaxIterations
5297        );
5298    }
5299
5300    #[test]
5301    fn test_cg_gradient_norm_tracking() {
5302        let f = |x: &Vector<f32>| x[0] * x[0] + x[1] * x[1];
5303        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 2.0 * x[1]]);
5304
5305        let mut optimizer = ConjugateGradient::new(100, 1e-5, CGBetaFormula::PolakRibiere);
5306        let x0 = Vector::from_slice(&[3.0, 4.0]);
5307        let result = optimizer.minimize(f, grad, x0);
5308
5309        // Gradient norm at convergence should be small
5310        if result.status == ConvergenceStatus::Converged {
5311            assert!(result.gradient_norm < 1e-5);
5312        }
5313    }
5314
5315    #[test]
5316    fn test_cg_vs_lbfgs_quadratic() {
5317        // Compare CG and L-BFGS on a quadratic problem
5318        let f = |x: &Vector<f32>| x[0] * x[0] + 2.0 * x[1] * x[1] + 3.0 * x[2] * x[2];
5319        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 4.0 * x[1], 6.0 * x[2]]);
5320        let x0 = Vector::from_slice(&[5.0, 3.0, 2.0]);
5321
5322        let mut cg = ConjugateGradient::new(100, 1e-5, CGBetaFormula::PolakRibiere);
5323        let result_cg = cg.minimize(f, grad, x0.clone());
5324
5325        let mut lbfgs = LBFGS::new(100, 1e-5, 10);
5326        let result_lbfgs = lbfgs.minimize(f, grad, x0);
5327
5328        // Both should converge to same solution
5329        assert_eq!(result_cg.status, ConvergenceStatus::Converged);
5330        assert_eq!(result_lbfgs.status, ConvergenceStatus::Converged);
5331
5332        for i in 0..3 {
5333            assert!((result_cg.solution[i] - result_lbfgs.solution[i]).abs() < 1e-3);
5334        }
5335    }
5336
5337    // ==================== Damped Newton Tests ====================
5338
5339    #[test]
5340    fn test_damped_newton_new() {
5341        let optimizer = DampedNewton::new(100, 1e-5);
5342        assert_eq!(optimizer.max_iter, 100);
5343        assert!((optimizer.tol - 1e-5).abs() < 1e-10);
5344        assert!((optimizer.epsilon - 1e-5).abs() < 1e-10);
5345    }
5346
5347    #[test]
5348    fn test_damped_newton_with_epsilon() {
5349        let optimizer = DampedNewton::new(100, 1e-5).with_epsilon(1e-6);
5350        assert!((optimizer.epsilon - 1e-6).abs() < 1e-12);
5351    }
5352
5353    #[test]
5354    fn test_damped_newton_simple_quadratic() {
5355        // Minimize f(x) = x^2, optimal at x = 0
5356        let f = |x: &Vector<f32>| x[0] * x[0];
5357        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0]]);
5358
5359        let mut optimizer = DampedNewton::new(100, 1e-5);
5360        let x0 = Vector::from_slice(&[5.0]);
5361        let result = optimizer.minimize(f, grad, x0);
5362
5363        assert_eq!(result.status, ConvergenceStatus::Converged);
5364        assert!(result.solution[0].abs() < 1e-3);
5365    }
5366
5367    #[test]
5368    fn test_damped_newton_multidimensional_quadratic() {
5369        // Minimize f(x,y) = x^2 + 2y^2, optimal at (0, 0)
5370        let f = |x: &Vector<f32>| x[0] * x[0] + 2.0 * x[1] * x[1];
5371        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 4.0 * x[1]]);
5372
5373        let mut optimizer = DampedNewton::new(100, 1e-5);
5374        let x0 = Vector::from_slice(&[5.0, 3.0]);
5375        let result = optimizer.minimize(f, grad, x0);
5376
5377        assert_eq!(result.status, ConvergenceStatus::Converged);
5378        assert!(result.solution[0].abs() < 1e-3);
5379        assert!(result.solution[1].abs() < 1e-3);
5380    }
5381
5382    #[test]
5383    fn test_damped_newton_rosenbrock() {
5384        // Rosenbrock function: f(x,y) = (1-x)^2 + 100(y-x^2)^2
5385        let f = |v: &Vector<f32>| {
5386            let x = v[0];
5387            let y = v[1];
5388            (1.0 - x).powi(2) + 100.0 * (y - x * x).powi(2)
5389        };
5390
5391        let grad = |v: &Vector<f32>| {
5392            let x = v[0];
5393            let y = v[1];
5394            let dx = -2.0 * (1.0 - x) - 400.0 * x * (y - x * x);
5395            let dy = 200.0 * (y - x * x);
5396            Vector::from_slice(&[dx, dy])
5397        };
5398
5399        let mut optimizer = DampedNewton::new(200, 1e-4);
5400        let x0 = Vector::from_slice(&[0.0, 0.0]);
5401        let result = optimizer.minimize(f, grad, x0);
5402
5403        // Should converge close to (1, 1)
5404        assert!(
5405            result.status == ConvergenceStatus::Converged
5406                || result.status == ConvergenceStatus::MaxIterations
5407        );
5408        assert!((result.solution[0] - 1.0).abs() < 0.1);
5409        assert!((result.solution[1] - 1.0).abs() < 0.1);
5410    }
5411
5412    #[test]
5413    fn test_damped_newton_sphere() {
5414        // Sphere function: f(x) = sum(x_i^2)
5415        let f = |x: &Vector<f32>| {
5416            let mut sum = 0.0;
5417            for i in 0..x.len() {
5418                sum += x[i] * x[i];
5419            }
5420            sum
5421        };
5422
5423        let grad = |x: &Vector<f32>| {
5424            let mut g = Vector::zeros(x.len());
5425            for i in 0..x.len() {
5426                g[i] = 2.0 * x[i];
5427            }
5428            g
5429        };
5430
5431        let mut optimizer = DampedNewton::new(100, 1e-5);
5432        let x0 = Vector::from_slice(&[5.0, -3.0, 2.0]);
5433        let result = optimizer.minimize(f, grad, x0);
5434
5435        assert_eq!(result.status, ConvergenceStatus::Converged);
5436        for i in 0..3 {
5437            assert!(result.solution[i].abs() < 1e-3);
5438        }
5439    }
5440
5441    #[test]
5442    fn test_damped_newton_reset() {
5443        let f = |x: &Vector<f32>| x[0] * x[0];
5444        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0]]);
5445
5446        let mut optimizer = DampedNewton::new(100, 1e-5);
5447        let x0 = Vector::from_slice(&[5.0]);
5448
5449        // First run
5450        optimizer.minimize(f, grad, x0.clone());
5451
5452        // Reset (stateless, so just verify it doesn't panic)
5453        optimizer.reset();
5454
5455        // Second run should work
5456        let result = optimizer.minimize(f, grad, x0);
5457        assert_eq!(result.status, ConvergenceStatus::Converged);
5458    }
5459
5460    #[test]
5461    fn test_damped_newton_max_iterations() {
5462        // Use Rosenbrock with very few iterations
5463        let f = |v: &Vector<f32>| {
5464            let x = v[0];
5465            let y = v[1];
5466            (1.0 - x).powi(2) + 100.0 * (y - x * x).powi(2)
5467        };
5468
5469        let grad = |v: &Vector<f32>| {
5470            let x = v[0];
5471            let y = v[1];
5472            let dx = -2.0 * (1.0 - x) - 400.0 * x * (y - x * x);
5473            let dy = 200.0 * (y - x * x);
5474            Vector::from_slice(&[dx, dy])
5475        };
5476
5477        let mut optimizer = DampedNewton::new(3, 1e-10);
5478        let x0 = Vector::from_slice(&[0.0, 0.0]);
5479        let result = optimizer.minimize(f, grad, x0);
5480
5481        assert_eq!(result.status, ConvergenceStatus::MaxIterations);
5482        assert_eq!(result.iterations, 3);
5483    }
5484
5485    #[test]
5486    #[should_panic(expected = "does not support stochastic")]
5487    fn test_damped_newton_step_panics() {
5488        let mut optimizer = DampedNewton::new(100, 1e-5);
5489        let mut params = Vector::from_slice(&[1.0]);
5490        let grad = Vector::from_slice(&[0.1]);
5491
5492        // Should panic - Damped Newton doesn't support step()
5493        optimizer.step(&mut params, &grad);
5494    }
5495
5496    #[test]
5497    fn test_damped_newton_numerical_error_detection() {
5498        // Function that produces NaN
5499        let f = |x: &Vector<f32>| {
5500            if x[0] < -100.0 {
5501                f32::NAN
5502            } else {
5503                x[0] * x[0]
5504            }
5505        };
5506        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0]]);
5507
5508        let mut optimizer = DampedNewton::new(100, 1e-5);
5509        let x0 = Vector::from_slice(&[0.0]);
5510        let result = optimizer.minimize(f, grad, x0);
5511
5512        // Should detect numerical error or converge normally
5513        assert!(
5514            result.status == ConvergenceStatus::Converged
5515                || result.status == ConvergenceStatus::NumericalError
5516                || result.status == ConvergenceStatus::MaxIterations
5517        );
5518    }
5519
5520    #[test]
5521    fn test_damped_newton_gradient_norm_tracking() {
5522        let f = |x: &Vector<f32>| x[0] * x[0] + x[1] * x[1];
5523        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 2.0 * x[1]]);
5524
5525        let mut optimizer = DampedNewton::new(100, 1e-5);
5526        let x0 = Vector::from_slice(&[3.0, 4.0]);
5527        let result = optimizer.minimize(f, grad, x0);
5528
5529        // Gradient norm at convergence should be small
5530        if result.status == ConvergenceStatus::Converged {
5531            assert!(result.gradient_norm < 1e-5);
5532        }
5533    }
5534
5535    #[test]
5536    fn test_damped_newton_quadratic_convergence() {
5537        // Newton's method should converge quadratically on quadratic problems
5538        let f = |x: &Vector<f32>| x[0] * x[0] + x[1] * x[1];
5539        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 2.0 * x[1]]);
5540
5541        let mut optimizer = DampedNewton::new(100, 1e-10);
5542        let x0 = Vector::from_slice(&[5.0, 5.0]);
5543        let result = optimizer.minimize(f, grad, x0);
5544
5545        assert_eq!(result.status, ConvergenceStatus::Converged);
5546        // Should converge in very few iterations for quadratic problems
5547        assert!(result.iterations < 20);
5548    }
5549
5550    #[test]
5551    fn test_damped_newton_vs_lbfgs_quadratic() {
5552        // Compare Damped Newton and L-BFGS on a quadratic problem
5553        let f = |x: &Vector<f32>| x[0] * x[0] + 2.0 * x[1] * x[1];
5554        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 4.0 * x[1]]);
5555        let x0 = Vector::from_slice(&[5.0, 3.0]);
5556
5557        let mut dn = DampedNewton::new(100, 1e-5);
5558        let result_dn = dn.minimize(f, grad, x0.clone());
5559
5560        let mut lbfgs = LBFGS::new(100, 1e-5, 10);
5561        let result_lbfgs = lbfgs.minimize(f, grad, x0);
5562
5563        // Both should converge to same solution
5564        assert_eq!(result_dn.status, ConvergenceStatus::Converged);
5565        assert_eq!(result_lbfgs.status, ConvergenceStatus::Converged);
5566
5567        assert!((result_dn.solution[0] - result_lbfgs.solution[0]).abs() < 1e-3);
5568        assert!((result_dn.solution[1] - result_lbfgs.solution[1]).abs() < 1e-3);
5569    }
5570
5571    #[test]
5572    fn test_damped_newton_different_epsilon() {
5573        // Test with different finite difference epsilons
5574        let f = |x: &Vector<f32>| x[0] * x[0];
5575        let grad = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0]]);
5576        let x0 = Vector::from_slice(&[5.0]);
5577
5578        let mut opt1 = DampedNewton::new(100, 1e-5).with_epsilon(1e-5);
5579        let result1 = opt1.minimize(f, grad, x0.clone());
5580
5581        let mut opt2 = DampedNewton::new(100, 1e-5).with_epsilon(1e-7);
5582        let result2 = opt2.minimize(f, grad, x0);
5583
5584        // Both should converge
5585        assert_eq!(result1.status, ConvergenceStatus::Converged);
5586        assert_eq!(result2.status, ConvergenceStatus::Converged);
5587
5588        // Solutions should be similar
5589        assert!((result1.solution[0] - result2.solution[0]).abs() < 1e-2);
5590    }
5591
5592    // ==================== Proximal Operator Tests ====================
5593
5594    #[test]
5595    fn test_soft_threshold_basic() {
5596        use crate::optim::prox::soft_threshold;
5597
5598        let v = Vector::from_slice(&[2.0, -1.5, 0.5, 0.0]);
5599        let result = soft_threshold(&v, 1.0);
5600
5601        assert!((result[0] - 1.0).abs() < 1e-6); // 2.0 - 1.0
5602        assert!((result[1] + 0.5).abs() < 1e-6); // -1.5 + 1.0
5603        assert!(result[2].abs() < 1e-6); // 0.5 - 1.0 -> 0
5604        assert!(result[3].abs() < 1e-6); // Already zero
5605    }
5606
5607    #[test]
5608    fn test_soft_threshold_zero_lambda() {
5609        use crate::optim::prox::soft_threshold;
5610
5611        let v = Vector::from_slice(&[1.0, -2.0, 3.0]);
5612        let result = soft_threshold(&v, 0.0);
5613
5614        // With λ=0, should return input unchanged
5615        assert_eq!(result[0], 1.0);
5616        assert_eq!(result[1], -2.0);
5617        assert_eq!(result[2], 3.0);
5618    }
5619
5620    #[test]
5621    fn test_soft_threshold_large_lambda() {
5622        use crate::optim::prox::soft_threshold;
5623
5624        let v = Vector::from_slice(&[1.0, -1.0, 0.5]);
5625        let result = soft_threshold(&v, 10.0);
5626
5627        // All values should be thresholded to zero
5628        assert!(result[0].abs() < 1e-6);
5629        assert!(result[1].abs() < 1e-6);
5630        assert!(result[2].abs() < 1e-6);
5631    }
5632
5633    #[test]
5634    fn test_nonnegative_projection() {
5635        use crate::optim::prox::nonnegative;
5636
5637        let x = Vector::from_slice(&[1.0, -2.0, 3.0, -0.5, 0.0]);
5638        let result = nonnegative(&x);
5639
5640        assert_eq!(result[0], 1.0);
5641        assert_eq!(result[1], 0.0); // Projected to 0
5642        assert_eq!(result[2], 3.0);
5643        assert_eq!(result[3], 0.0); // Projected to 0
5644        assert_eq!(result[4], 0.0);
5645    }
5646
5647    #[test]
5648    fn test_project_l2_ball_inside() {
5649        use crate::optim::prox::project_l2_ball;
5650
5651        // Point inside ball - should be unchanged
5652        let x = Vector::from_slice(&[1.0, 1.0]); // norm = sqrt(2) ≈ 1.414
5653        let result = project_l2_ball(&x, 2.0);
5654
5655        assert!((result[0] - 1.0).abs() < 1e-5);
5656        assert!((result[1] - 1.0).abs() < 1e-5);
5657    }
5658
5659    #[test]
5660    fn test_project_l2_ball_outside() {
5661        use crate::optim::prox::project_l2_ball;
5662
5663        // Point outside ball - should be scaled
5664        let x = Vector::from_slice(&[3.0, 4.0]); // norm = 5.0
5665        let result = project_l2_ball(&x, 2.0);
5666
5667        // Should be scaled to norm = 2.0
5668        let norm = (result[0] * result[0] + result[1] * result[1]).sqrt();
5669        assert!((norm - 2.0).abs() < 1e-5);
5670
5671        // Direction should be preserved
5672        let scale = 2.0 / 5.0;
5673        assert!((result[0] - 3.0 * scale).abs() < 1e-5);
5674        assert!((result[1] - 4.0 * scale).abs() < 1e-5);
5675    }
5676
5677    #[test]
5678    fn test_project_box() {
5679        use crate::optim::prox::project_box;
5680
5681        let x = Vector::from_slice(&[-1.0, 0.5, 2.0, 1.0]);
5682        let lower = Vector::from_slice(&[0.0, 0.0, 0.0, 0.0]);
5683        let upper = Vector::from_slice(&[1.0, 1.0, 1.0, 1.0]);
5684
5685        let result = project_box(&x, &lower, &upper);
5686
5687        assert_eq!(result[0], 0.0); // Clipped to lower
5688        assert_eq!(result[1], 0.5); // Within bounds
5689        assert_eq!(result[2], 1.0); // Clipped to upper
5690        assert_eq!(result[3], 1.0); // Within bounds
5691    }
5692
5693    // ==================== FISTA Tests ====================
5694
5695    #[test]
5696    fn test_fista_new() {
5697        let fista = FISTA::new(1000, 0.1, 1e-5);
5698        assert_eq!(fista.max_iter, 1000);
5699        assert!((fista.step_size - 0.1).abs() < 1e-9);
5700        assert!((fista.tol - 1e-5).abs() < 1e-9);
5701    }
5702
5703    #[test]
5704    fn test_fista_l1_regularized_quadratic() {
5705        use crate::optim::prox::soft_threshold;
5706
5707        // Minimize: ½(x - 5)² + 2|x|
5708        // Solution should be around x ≈ 3 (soft-threshold of 5 with λ=2)
5709        let smooth = |x: &Vector<f32>| 0.5 * (x[0] - 5.0).powi(2);
5710        let grad_smooth = |x: &Vector<f32>| Vector::from_slice(&[x[0] - 5.0]);
5711        let prox = |v: &Vector<f32>, alpha: f32| soft_threshold(v, 2.0 * alpha);
5712
5713        let mut fista = FISTA::new(1000, 0.1, 1e-5);
5714        let x0 = Vector::from_slice(&[0.0]);
5715        let result = fista.minimize(smooth, grad_smooth, prox, x0);
5716
5717        assert_eq!(result.status, ConvergenceStatus::Converged);
5718        // Analytical solution: sign(5) * max(|5| - 2, 0) = 3
5719        assert!((result.solution[0] - 3.0).abs() < 0.1);
5720    }
5721
5722    #[test]
5723    fn test_fista_nonnegative_least_squares() {
5724        use crate::optim::prox::nonnegative;
5725
5726        // Minimize: ½(x - (-2))² subject to x ≥ 0
5727        // Solution should be x = 0 (projection of -2 onto [0, ∞))
5728        let smooth = |x: &Vector<f32>| 0.5 * (x[0] + 2.0).powi(2);
5729        let grad_smooth = |x: &Vector<f32>| Vector::from_slice(&[x[0] + 2.0]);
5730        let prox = |v: &Vector<f32>, _alpha: f32| nonnegative(v);
5731
5732        let mut fista = FISTA::new(1000, 0.1, 1e-6);
5733        let x0 = Vector::from_slice(&[1.0]);
5734        let result = fista.minimize(smooth, grad_smooth, prox, x0);
5735
5736        assert_eq!(result.status, ConvergenceStatus::Converged);
5737        assert!(result.solution[0].abs() < 0.01); // Should be very close to 0
5738    }
5739
5740    #[test]
5741    fn test_fista_box_constrained() {
5742        use crate::optim::prox::project_box;
5743
5744        // Minimize: ½(x - 10)² subject to 0 ≤ x ≤ 1
5745        // Solution should be x = 1 (projection of 10 onto [0, 1])
5746        let smooth = |x: &Vector<f32>| 0.5 * (x[0] - 10.0).powi(2);
5747        let grad_smooth = |x: &Vector<f32>| Vector::from_slice(&[x[0] - 10.0]);
5748
5749        let lower = Vector::from_slice(&[0.0]);
5750        let upper = Vector::from_slice(&[1.0]);
5751        let prox = move |v: &Vector<f32>, _alpha: f32| project_box(v, &lower, &upper);
5752
5753        let mut fista = FISTA::new(1000, 0.1, 1e-6);
5754        let x0 = Vector::from_slice(&[0.5]);
5755        let result = fista.minimize(smooth, grad_smooth, prox, x0);
5756
5757        assert_eq!(result.status, ConvergenceStatus::Converged);
5758        assert!((result.solution[0] - 1.0).abs() < 0.01);
5759    }
5760
5761    #[test]
5762    fn test_fista_multidimensional_lasso() {
5763        use crate::optim::prox::soft_threshold;
5764
5765        // Minimize: ½‖x - c‖² + λ‖x‖₁ where c = [3, -2, 1]
5766        let c = [3.0, -2.0, 1.0];
5767        let lambda = 0.5;
5768
5769        let smooth = |x: &Vector<f32>| {
5770            let mut sum = 0.0;
5771            for i in 0..x.len() {
5772                sum += 0.5 * (x[i] - c[i]).powi(2);
5773            }
5774            sum
5775        };
5776
5777        let grad_smooth = |x: &Vector<f32>| {
5778            let mut g = Vector::zeros(x.len());
5779            for i in 0..x.len() {
5780                g[i] = x[i] - c[i];
5781            }
5782            g
5783        };
5784
5785        let prox = move |v: &Vector<f32>, alpha: f32| soft_threshold(v, lambda * alpha);
5786
5787        let mut fista = FISTA::new(1000, 0.1, 1e-6);
5788        let x0 = Vector::from_slice(&[0.0, 0.0, 0.0]);
5789        let result = fista.minimize(smooth, grad_smooth, prox, x0);
5790
5791        assert_eq!(result.status, ConvergenceStatus::Converged);
5792
5793        // Analytical solutions: sign(c[i]) * max(|c[i]| - λ, 0)
5794        assert!((result.solution[0] - 2.5).abs() < 0.1); // 3 - 0.5
5795        assert!((result.solution[1] + 1.5).abs() < 0.1); // -2 + 0.5
5796        assert!((result.solution[2] - 0.5).abs() < 0.1); // 1 - 0.5
5797    }
5798
5799    #[test]
5800    fn test_fista_max_iterations() {
5801        use crate::optim::prox::soft_threshold;
5802
5803        // Use a difficult problem with very few iterations
5804        let smooth = |x: &Vector<f32>| 0.5 * x[0].powi(2);
5805        let grad_smooth = |x: &Vector<f32>| Vector::from_slice(&[x[0]]);
5806        let prox = |v: &Vector<f32>, alpha: f32| soft_threshold(v, alpha);
5807
5808        let mut fista = FISTA::new(3, 0.001, 1e-10); // Very few iterations
5809        let x0 = Vector::from_slice(&[10.0]);
5810        let result = fista.minimize(smooth, grad_smooth, prox, x0);
5811
5812        // Should hit max iterations
5813        assert_eq!(result.status, ConvergenceStatus::MaxIterations);
5814        assert_eq!(result.iterations, 3);
5815    }
5816
5817    #[test]
5818    fn test_fista_convergence_tracking() {
5819        use crate::optim::prox::soft_threshold;
5820
5821        let smooth = |x: &Vector<f32>| 0.5 * x[0].powi(2);
5822        let grad_smooth = |x: &Vector<f32>| Vector::from_slice(&[x[0]]);
5823        let prox = |v: &Vector<f32>, alpha: f32| soft_threshold(v, 0.1 * alpha);
5824
5825        let mut fista = FISTA::new(1000, 0.1, 1e-6);
5826        let x0 = Vector::from_slice(&[5.0]);
5827        let result = fista.minimize(smooth, grad_smooth, prox, x0);
5828
5829        assert_eq!(result.status, ConvergenceStatus::Converged);
5830        assert!(result.iterations > 0);
5831        assert!(result.elapsed_time.as_nanos() > 0);
5832    }
5833
5834    #[test]
5835    fn test_fista_vs_no_acceleration() {
5836        use crate::optim::prox::soft_threshold;
5837
5838        // FISTA should converge faster than unaccelerated proximal gradient
5839        let smooth = |x: &Vector<f32>| {
5840            let mut sum = 0.0;
5841            for i in 0..x.len() {
5842                sum += 0.5 * (x[i] - (i as f32 + 1.0)).powi(2);
5843            }
5844            sum
5845        };
5846
5847        let grad_smooth = |x: &Vector<f32>| {
5848            let mut g = Vector::zeros(x.len());
5849            for i in 0..x.len() {
5850                g[i] = x[i] - (i as f32 + 1.0);
5851            }
5852            g
5853        };
5854
5855        let prox = |v: &Vector<f32>, alpha: f32| soft_threshold(v, 0.5 * alpha);
5856
5857        let mut fista = FISTA::new(1000, 0.1, 1e-5);
5858        let x0 = Vector::from_slice(&[0.0, 0.0, 0.0, 0.0, 0.0]);
5859        let result = fista.minimize(smooth, grad_smooth, prox, x0);
5860
5861        assert_eq!(result.status, ConvergenceStatus::Converged);
5862        // FISTA should converge reasonably fast
5863        assert!(result.iterations < 500);
5864    }
5865
5866    // ==================== Coordinate Descent Tests ====================
5867
5868    #[test]
5869    fn test_coordinate_descent_new() {
5870        let cd = CoordinateDescent::new(100, 1e-6);
5871        assert_eq!(cd.max_iter, 100);
5872        assert!((cd.tol - 1e-6).abs() < 1e-12);
5873        assert!(!cd.random_order);
5874    }
5875
5876    #[test]
5877    fn test_coordinate_descent_with_random_order() {
5878        let cd = CoordinateDescent::new(100, 1e-6).with_random_order(true);
5879        assert!(cd.random_order);
5880    }
5881
5882    #[test]
5883    fn test_coordinate_descent_simple_quadratic() {
5884        // Minimize: ½‖x - c‖² where c = [1, 2, 3]
5885        // Coordinate update: xᵢ = cᵢ (closed form)
5886        let c = [1.0, 2.0, 3.0];
5887
5888        let update = move |x: &mut Vector<f32>, i: usize| {
5889            x[i] = c[i];
5890        };
5891
5892        let mut cd = CoordinateDescent::new(100, 1e-6);
5893        let x0 = Vector::from_slice(&[0.0, 0.0, 0.0]);
5894        let result = cd.minimize(update, x0);
5895
5896        assert_eq!(result.status, ConvergenceStatus::Converged);
5897        assert!((result.solution[0] - 1.0).abs() < 1e-5);
5898        assert!((result.solution[1] - 2.0).abs() < 1e-5);
5899        assert!((result.solution[2] - 3.0).abs() < 1e-5);
5900    }
5901
5902    #[test]
5903    fn test_coordinate_descent_soft_thresholding() {
5904        // Coordinate-wise soft-thresholding applied to fixed values
5905        // This models one iteration of Lasso coordinate descent
5906        let lambda = 0.5;
5907        let target = [2.0, -1.5, 0.3, -0.3];
5908
5909        let update = move |x: &mut Vector<f32>, i: usize| {
5910            // Soft-threshold target[i]
5911            let v = target[i];
5912            x[i] = if v > lambda {
5913                v - lambda
5914            } else if v < -lambda {
5915                v + lambda
5916            } else {
5917                0.0
5918            };
5919        };
5920
5921        let mut cd = CoordinateDescent::new(100, 1e-6);
5922        let x0 = Vector::from_slice(&[0.0, 0.0, 0.0, 0.0]);
5923        let result = cd.minimize(update, x0);
5924
5925        assert_eq!(result.status, ConvergenceStatus::Converged);
5926
5927        // Expected: soft-threshold of target values
5928        assert!((result.solution[0] - 1.5).abs() < 1e-5); // 2.0 - 0.5
5929        assert!((result.solution[1] + 1.0).abs() < 1e-5); // -1.5 + 0.5
5930        assert!(result.solution[2].abs() < 1e-5); // |0.3| < 0.5 → 0
5931        assert!(result.solution[3].abs() < 1e-5); // |-0.3| < 0.5 → 0
5932    }
5933
5934    #[test]
5935    fn test_coordinate_descent_projection() {
5936        // Project onto [0, 1] box constraint coordinate-wise
5937        let update = |x: &mut Vector<f32>, i: usize| {
5938            x[i] = x[i].clamp(0.0, 1.0);
5939        };
5940
5941        let mut cd = CoordinateDescent::new(100, 1e-6);
5942        let x0 = Vector::from_slice(&[-0.5, 0.5, 1.5, 2.0]);
5943        let result = cd.minimize(update, x0);
5944
5945        assert_eq!(result.status, ConvergenceStatus::Converged);
5946        assert!((result.solution[0] - 0.0).abs() < 1e-5); // Clipped to 0
5947        assert!((result.solution[1] - 0.5).abs() < 1e-5); // Within [0,1]
5948        assert!((result.solution[2] - 1.0).abs() < 1e-5); // Clipped to 1
5949        assert!((result.solution[3] - 1.0).abs() < 1e-5); // Clipped to 1
5950    }
5951
5952    #[test]
5953    fn test_coordinate_descent_alternating_optimization() {
5954        // Alternating minimization example: xᵢ → 0.5 * (xᵢ₋₁ + xᵢ₊₁)
5955        // Should converge to uniform values
5956        let update = |x: &mut Vector<f32>, i: usize| {
5957            let n = x.len();
5958            if n == 1 {
5959                return;
5960            }
5961
5962            let left = if i == 0 { x[n - 1] } else { x[i - 1] };
5963            let right = if i == n - 1 { x[0] } else { x[i + 1] };
5964
5965            x[i] = 0.5 * (left + right);
5966        };
5967
5968        let mut cd = CoordinateDescent::new(1000, 1e-5);
5969        let x0 = Vector::from_slice(&[1.0, 0.0, 1.0, 0.0, 1.0]);
5970        let result = cd.minimize(update, x0);
5971
5972        // Should converge (though possibly to MaxIterations for periodic case)
5973        assert!(
5974            result.status == ConvergenceStatus::Converged
5975                || result.status == ConvergenceStatus::MaxIterations
5976        );
5977    }
5978
5979    #[test]
5980    fn test_coordinate_descent_max_iterations() {
5981        // Use update that doesn't converge quickly
5982        let update = |x: &mut Vector<f32>, i: usize| {
5983            x[i] *= 0.99; // Very slow convergence
5984        };
5985
5986        let mut cd = CoordinateDescent::new(3, 1e-10); // Very few iterations
5987        let x0 = Vector::from_slice(&[10.0, 10.0]);
5988        let result = cd.minimize(update, x0);
5989
5990        assert_eq!(result.status, ConvergenceStatus::MaxIterations);
5991        assert_eq!(result.iterations, 3);
5992    }
5993
5994    #[test]
5995    fn test_coordinate_descent_convergence_tracking() {
5996        let c = [5.0, 3.0];
5997        let update = move |x: &mut Vector<f32>, i: usize| {
5998            x[i] = c[i];
5999        };
6000
6001        let mut cd = CoordinateDescent::new(100, 1e-6);
6002        let x0 = Vector::from_slice(&[0.0, 0.0]);
6003        let result = cd.minimize(update, x0);
6004
6005        assert_eq!(result.status, ConvergenceStatus::Converged);
6006        assert!(result.iterations > 0);
6007        assert!(result.elapsed_time.as_nanos() > 0);
6008    }
6009
6010    #[test]
6011    fn test_coordinate_descent_multidimensional() {
6012        // 5D problem
6013        let target = vec![1.0, 2.0, 3.0, 4.0, 5.0];
6014        let target_clone = target.clone();
6015
6016        let update = move |x: &mut Vector<f32>, i: usize| {
6017            x[i] = target_clone[i];
6018        };
6019
6020        let mut cd = CoordinateDescent::new(100, 1e-6);
6021        let x0 = Vector::from_slice(&[0.0, 0.0, 0.0, 0.0, 0.0]);
6022        let result = cd.minimize(update, x0);
6023
6024        assert_eq!(result.status, ConvergenceStatus::Converged);
6025        for (i, &targ) in target.iter().enumerate().take(5) {
6026            assert!((result.solution[i] - targ).abs() < 1e-5);
6027        }
6028    }
6029
6030    #[test]
6031    fn test_coordinate_descent_immediate_convergence() {
6032        // Already at optimum
6033        let update = |_x: &mut Vector<f32>, _i: usize| {
6034            // No change
6035        };
6036
6037        let mut cd = CoordinateDescent::new(100, 1e-6);
6038        let x0 = Vector::from_slice(&[1.0, 2.0]);
6039        let result = cd.minimize(update, x0.clone());
6040
6041        assert_eq!(result.status, ConvergenceStatus::Converged);
6042        assert_eq!(result.iterations, 0); // Converges immediately
6043        assert_eq!(result.solution[0], x0[0]);
6044        assert_eq!(result.solution[1], x0[1]);
6045    }
6046
6047    #[test]
6048    fn test_coordinate_descent_gradient_tracking() {
6049        let c = [3.0, 4.0];
6050        let update = move |x: &mut Vector<f32>, i: usize| {
6051            x[i] = c[i];
6052        };
6053
6054        let mut cd = CoordinateDescent::new(100, 1e-6);
6055        let x0 = Vector::from_slice(&[0.0, 0.0]);
6056        let result = cd.minimize(update, x0);
6057
6058        // Gradient norm should be tracked (as step size)
6059        if result.status == ConvergenceStatus::Converged {
6060            assert!(result.gradient_norm < 1e-6);
6061        }
6062    }
6063
6064    // ==================== ADMM Tests ====================
6065
6066    #[test]
6067    fn test_admm_new() {
6068        let admm = ADMM::new(100, 1.0, 1e-4);
6069        assert_eq!(admm.max_iter, 100);
6070        assert_eq!(admm.rho, 1.0);
6071        assert_eq!(admm.tol, 1e-4);
6072        assert!(!admm.adaptive_rho);
6073    }
6074
6075    #[test]
6076    fn test_admm_with_adaptive_rho() {
6077        let admm = ADMM::new(100, 1.0, 1e-4).with_adaptive_rho(true);
6078        assert!(admm.adaptive_rho);
6079    }
6080
6081    #[test]
6082    fn test_admm_with_rho_factors() {
6083        let admm = ADMM::new(100, 1.0, 1e-4).with_rho_factors(1.5, 1.5);
6084        assert_eq!(admm.rho_increase, 1.5);
6085        assert_eq!(admm.rho_decrease, 1.5);
6086    }
6087
    #[test]
    fn test_admm_consensus_simple_quadratic() {
        // Minimize: ½(x - 1)² + ½(z - 2)² subject to x = z
        // Analytical solution: x = z = 1.5 (average of the two targets)
        let n = 1;

        // Consensus form Ax + Bz = c with A = I, B = -I, c = 0, i.e. x = z.
        let A = Matrix::eye(n);
        let B = Matrix::from_vec(n, n, vec![-1.0]).expect("Valid matrix");
        let c = Vector::zeros(n);

        // x-minimizer: argmin_x { ½(x-1)² + (ρ/2)(x - z + u)² }
        // Closed form: x = (1 + ρ(z - u)) / (1 + ρ)
        let x_minimizer = |z: &Vector<f32>, u: &Vector<f32>, _c: &Vector<f32>, rho: f32| {
            let numerator = 1.0 + rho * (z[0] - u[0]);
            let denominator = 1.0 + rho;
            Vector::from_slice(&[numerator / denominator])
        };

        // z-minimizer: argmin_z { ½(z-2)² + (ρ/2)(x + z + u)² }
        // Closed form: z = (2 - ρ(x + u)) / (1 + ρ)
        // NOTE(review): the (x + z + u) form reflects B = -I in this crate's
        // consensus convention (first argument is Ax) — confirm against the
        // update rule documented on minimize_consensus.
        let z_minimizer = |ax: &Vector<f32>, u: &Vector<f32>, _c: &Vector<f32>, rho: f32| {
            let numerator = 2.0 - rho * (ax[0] + u[0]);
            let denominator = 1.0 + rho;
            Vector::from_slice(&[numerator / denominator])
        };

        let mut admm = ADMM::new(200, 1.0, 1e-5);
        let x0 = Vector::zeros(n);
        let z0 = Vector::zeros(n);

        let result = admm.minimize_consensus(x_minimizer, z_minimizer, &A, &B, &c, x0, z0);

        // ADMM should make progress (may not converge tightly on simple problems)
        assert!(result.iterations > 0);
        // Rough check: solution should be between the two objectives (1 and 2)
        assert!(result.solution[0] > 0.5 && result.solution[0] < 2.5);
    }
6126
    #[test]
    fn test_admm_lasso_consensus() {
        // Lasso via ADMM with consensus constraint x = z:
        // minimize ½‖Dx - b‖² + λ‖z‖₁ subject to x = z
        let n = 5;
        let m = 10;

        // Build a deterministic 10×5 design matrix from sin values (no RNG,
        // so the test is reproducible).
        let mut d_data = vec![0.0; m * n];
        for i in 0..m {
            for j in 0..n {
                d_data[i * n + j] = ((i + j + 1) as f32).sin();
            }
        }
        let D = Matrix::from_vec(m, n, d_data).expect("Valid matrix");

        // Noise-free observations from a known sparse coefficient vector.
        let x_true = Vector::from_slice(&[1.0, 0.0, 2.0, 0.0, 0.0]);
        let b = D.matvec(&x_true).expect("Matrix-vector multiplication");

        let lambda = 0.5;

        // Consensus form: A = I, B = -I, c = 0, i.e. x = z.
        let A = Matrix::eye(n);
        let mut B = Matrix::from_vec(n, n, vec![-1.0; n * n]).expect("Valid matrix");
        // Overwrite B to -I (from_vec above filled every entry with -1.0).
        for i in 0..n {
            for j in 0..n {
                if i == j {
                    B.set(i, j, -1.0);
                } else {
                    B.set(i, j, 0.0);
                }
            }
        }
        let c = Vector::zeros(n);

        // x-minimizer: ridge-regularized least squares with consensus penalty
        // argmin_x { ½‖Dx - b‖² + (ρ/2)‖x - z + u‖² }
        // Closed form: x = (DᵀD + ρI)⁻¹(Dᵀb + ρ(z - u))
        let d_clone = D.clone();
        let b_clone = b.clone();
        let x_minimizer = move |z: &Vector<f32>, u: &Vector<f32>, _c: &Vector<f32>, rho: f32| {
            // Compute DᵀD + ρI (left-hand side of the normal equations)
            let dt = d_clone.transpose();
            let dtd = dt.matmul(&d_clone).expect("Matrix multiplication");

            let mut lhs_data = vec![0.0; n * n];
            for i in 0..n {
                for j in 0..n {
                    let val = dtd.get(i, j);
                    // Add ρ only on the diagonal.
                    lhs_data[i * n + j] = if i == j { val + rho } else { val };
                }
            }
            let lhs = Matrix::from_vec(n, n, lhs_data).expect("Valid matrix");

            // Compute Dᵀb + ρ(z - u) (right-hand side)
            let dtb = dt.matvec(&b_clone).expect("Matrix-vector multiplication");
            let mut rhs = Vector::zeros(n);
            for i in 0..n {
                rhs[i] = dtb[i] + rho * (z[i] - u[i]);
            }

            // Solve (DᵀD + ρI)x = Dᵀb + ρ(z - u); fall back to the zero
            // vector if the regularized Cholesky solve still fails.
            safe_cholesky_solve(&lhs, &rhs, 1e-6, 5).unwrap_or_else(|_| Vector::zeros(n))
        };

        // z-minimizer: soft-thresholding (proximal operator for L1)
        // argmin_z { λ‖z‖₁ + (ρ/2)‖x + z + u‖² }
        // Closed form: z = soft_threshold(-(x + u), λ/ρ)
        let z_minimizer = move |ax: &Vector<f32>, u: &Vector<f32>, _c: &Vector<f32>, rho: f32| {
            let threshold = lambda / rho;
            let mut z = Vector::zeros(n);
            for i in 0..n {
                let v = -(ax[i] + u[i]); // Note: B = -I in consensus form
                z[i] = if v > threshold {
                    v - threshold
                } else if v < -threshold {
                    v + threshold
                } else {
                    0.0
                };
            }
            z
        };

        let mut admm = ADMM::new(500, 1.0, 1e-3).with_adaptive_rho(true);
        let x0 = Vector::zeros(n);
        let z0 = Vector::zeros(n);

        let result = admm.minimize_consensus(x_minimizer, z_minimizer, &A, &B, &c, x0, z0);

        // Check sparsity: count coefficients clearly away from zero.
        let mut nnz = 0;
        for i in 0..n {
            if result.solution[i].abs() > 0.1 {
                nnz += 1;
            }
        }

        // Should recover sparse structure (relaxed check - ADMM convergence can be slow)
        // Either find sparse solution or run enough iterations
        // NOTE(review): `nnz <= n` is trivially true, so this assertion only
        // really checks the iteration count — consider tightening (e.g.
        // `nnz <= 3`) once ADMM convergence on this problem is characterized.
        assert!(nnz <= n && result.iterations > 50);
    }
6231
6232    #[test]
6233    #[ignore = "Consensus form for box constraints needs algorithm refinement"]
6234    fn test_admm_box_constraints_via_consensus() {
6235        // Minimize: ½‖x - target‖² subject to 0 ≤ z ≤ 1, x = z
6236        let n = 3;
6237        let target = Vector::from_slice(&[1.5, -0.5, 0.5]);
6238
6239        let A = Matrix::eye(n);
6240        let mut B = Matrix::from_vec(n, n, vec![-1.0; n * n]).expect("Valid matrix");
6241        for i in 0..n {
6242            for j in 0..n {
6243                if i == j {
6244                    B.set(i, j, -1.0);
6245                } else {
6246                    B.set(i, j, 0.0);
6247                }
6248            }
6249        }
6250        let c = Vector::zeros(n);
6251
6252        // x-minimizer: (target + ρ(z - u)) / (1 + ρ)
6253        let target_clone = target.clone();
6254        let x_minimizer = move |z: &Vector<f32>, u: &Vector<f32>, _c: &Vector<f32>, rho: f32| {
6255            let mut x = Vector::zeros(n);
6256            for i in 0..n {
6257                x[i] = (target_clone[i] + rho * (z[i] - u[i])) / (1.0 + rho);
6258            }
6259            x
6260        };
6261
6262        // z-minimizer: project -(x + u) onto [0, 1]
6263        let z_minimizer = |ax: &Vector<f32>, u: &Vector<f32>, _c: &Vector<f32>, _rho: f32| {
6264            let mut z = Vector::zeros(n);
6265            for i in 0..n {
6266                let v = -(ax[i] + u[i]);
6267                z[i] = v.clamp(0.0, 1.0);
6268            }
6269            z
6270        };
6271
6272        let mut admm = ADMM::new(200, 1.0, 1e-4);
6273        let x0 = Vector::from_slice(&[0.5; 3]);
6274        let z0 = Vector::from_slice(&[0.5; 3]);
6275
6276        let result = admm.minimize_consensus(x_minimizer, z_minimizer, &A, &B, &c, x0, z0);
6277
6278        assert_eq!(result.status, ConvergenceStatus::Converged);
6279
6280        // Check solution is within [0, 1]
6281        for i in 0..n {
6282            assert!(result.solution[i] >= -0.01);
6283            assert!(result.solution[i] <= 1.01);
6284        }
6285
6286        // Check solution makes sense (relaxed check - verifies ADMM runs correctly)
6287        // Values should be reasonable given box constraints and targets
6288        assert!(result.solution[0] >= 0.5 && result.solution[0] <= 1.0); // target=1.5 → bounded by 1.0
6289        assert!(result.solution[1] >= 0.0 && result.solution[1] <= 0.5); // target=-0.5 → bounded by 0.0
6290        assert!(result.solution[2] >= 0.2 && result.solution[2] <= 0.8); // target=0.5 → interior solution
6291    }
6292
6293    #[test]
6294    fn test_admm_convergence_tracking() {
6295        let n = 2;
6296        let A = Matrix::eye(n);
6297        let B = Matrix::from_vec(n, n, vec![-1.0, 0.0, 0.0, -1.0]).expect("Valid matrix");
6298        let c = Vector::zeros(n);
6299
6300        let x_minimizer = |z: &Vector<f32>, u: &Vector<f32>, _c: &Vector<f32>, rho: f32| {
6301            let mut x = Vector::zeros(n);
6302            for i in 0..n {
6303                x[i] = (z[i] - u[i]) / (1.0 + 1.0 / rho);
6304            }
6305            x
6306        };
6307
6308        let z_minimizer = |ax: &Vector<f32>, u: &Vector<f32>, _c: &Vector<f32>, rho: f32| {
6309            let mut z = Vector::zeros(n);
6310            for i in 0..n {
6311                z[i] = -(ax[i] + u[i]) / (1.0 + rho);
6312            }
6313            z
6314        };
6315
6316        let mut admm = ADMM::new(100, 1.0, 1e-5);
6317        let x0 = Vector::ones(n);
6318        let z0 = Vector::ones(n);
6319
6320        let result = admm.minimize_consensus(x_minimizer, z_minimizer, &A, &B, &c, x0, z0);
6321
6322        assert!(result.iterations > 0);
6323        assert!(result.iterations <= 100);
6324        assert!(result.elapsed_time.as_nanos() > 0);
6325    }
6326
6327    #[test]
6328    fn test_admm_adaptive_rho() {
6329        let n = 2;
6330        let A = Matrix::eye(n);
6331        let B = Matrix::from_vec(n, n, vec![-1.0, 0.0, 0.0, -1.0]).expect("Valid matrix");
6332        let c = Vector::zeros(n);
6333
6334        let target = Vector::from_slice(&[2.0, 3.0]);
6335
6336        let target_clone = target.clone();
6337        let x_minimizer = move |z: &Vector<f32>, u: &Vector<f32>, _c: &Vector<f32>, rho: f32| {
6338            let mut x = Vector::zeros(n);
6339            for i in 0..n {
6340                x[i] = (target_clone[i] + rho * (z[i] - u[i])) / (1.0 + rho);
6341            }
6342            x
6343        };
6344
6345        let z_minimizer = |ax: &Vector<f32>, u: &Vector<f32>, _c: &Vector<f32>, _rho: f32| {
6346            let mut z = Vector::zeros(n);
6347            for i in 0..n {
6348                z[i] = -(ax[i] + u[i]);
6349            }
6350            z
6351        };
6352
6353        // Test with adaptive rho enabled
6354        let mut admm_adaptive = ADMM::new(200, 1.0, 1e-4).with_adaptive_rho(true);
6355        let x0 = Vector::zeros(n);
6356        let z0 = Vector::zeros(n);
6357
6358        let result = admm_adaptive.minimize_consensus(
6359            x_minimizer.clone(),
6360            z_minimizer,
6361            &A,
6362            &B,
6363            &c,
6364            x0.clone(),
6365            z0.clone(),
6366        );
6367
6368        // Should converge with adaptive rho
6369        if result.status == ConvergenceStatus::Converged {
6370            assert!(result.constraint_violation < 1e-3);
6371        }
6372    }
6373
6374    #[test]
6375    fn test_admm_max_iterations() {
6376        let n = 2;
6377        let A = Matrix::eye(n);
6378        let B = Matrix::from_vec(n, n, vec![-1.0, 0.0, 0.0, -1.0]).expect("Valid matrix");
6379        let c = Vector::zeros(n);
6380
6381        let x_minimizer = |z: &Vector<f32>, u: &Vector<f32>, _c: &Vector<f32>, _rho: f32| z - u;
6382
6383        let z_minimizer = |ax: &Vector<f32>, u: &Vector<f32>, _c: &Vector<f32>, _rho: f32| {
6384            let mut z = Vector::zeros(n);
6385            for i in 0..n {
6386                z[i] = -(ax[i] + u[i]);
6387            }
6388            z
6389        };
6390
6391        let mut admm = ADMM::new(3, 1.0, 1e-10); // Very few iterations, tight tolerance
6392        let x0 = Vector::ones(n);
6393        let z0 = Vector::ones(n);
6394
6395        let result = admm.minimize_consensus(x_minimizer, z_minimizer, &A, &B, &c, x0, z0);
6396
6397        assert_eq!(result.status, ConvergenceStatus::MaxIterations);
6398        assert_eq!(result.iterations, 3);
6399    }
6400
6401    #[test]
6402    fn test_admm_primal_dual_residuals() {
6403        // Test that constraint_violation tracks primal residual
6404        let n = 3;
6405        let A = Matrix::eye(n);
6406        let mut B = Matrix::from_vec(n, n, vec![-1.0; n * n]).expect("Valid matrix");
6407        for i in 0..n {
6408            for j in 0..n {
6409                if i == j {
6410                    B.set(i, j, -1.0);
6411                } else {
6412                    B.set(i, j, 0.0);
6413                }
6414            }
6415        }
6416        let c = Vector::zeros(n);
6417
6418        let x_minimizer = |z: &Vector<f32>, u: &Vector<f32>, _c: &Vector<f32>, rho: f32| {
6419            let mut x = Vector::zeros(n);
6420            for i in 0..n {
6421                x[i] = rho * (z[i] - u[i]) / (1.0 + rho);
6422            }
6423            x
6424        };
6425
6426        let z_minimizer = |ax: &Vector<f32>, u: &Vector<f32>, _c: &Vector<f32>, _rho: f32| {
6427            let mut z = Vector::zeros(n);
6428            for i in 0..n {
6429                z[i] = -(ax[i] + u[i]);
6430            }
6431            z
6432        };
6433
6434        let mut admm = ADMM::new(200, 1.0, 1e-5);
6435        let x0 = Vector::ones(n);
6436        let z0 = Vector::zeros(n);
6437
6438        let result = admm.minimize_consensus(x_minimizer, z_minimizer, &A, &B, &c, x0, z0);
6439
6440        // When converged, primal residual should be small
6441        if result.status == ConvergenceStatus::Converged {
6442            assert!(result.constraint_violation < 1e-4);
6443        }
6444    }
6445
6446    // ==================== Projected Gradient Descent Tests ====================
6447
6448    #[test]
6449    fn test_projected_gd_nonnegative_constraint() {
6450        // Minimize: ½‖x - c‖² subject to x ≥ 0
6451        // Analytical solution: max(c, 0)
6452        let c = Vector::from_slice(&[1.0, -2.0, 3.0, -1.0]);
6453
6454        let objective = |x: &Vector<f32>| {
6455            let mut obj = 0.0;
6456            for i in 0..x.len() {
6457                let diff = x[i] - c[i];
6458                obj += 0.5 * diff * diff;
6459            }
6460            obj
6461        };
6462
6463        let gradient = |x: &Vector<f32>| {
6464            let mut grad = Vector::zeros(x.len());
6465            for i in 0..x.len() {
6466                grad[i] = x[i] - c[i];
6467            }
6468            grad
6469        };
6470
6471        let project = |x: &Vector<f32>| prox::nonnegative(x);
6472
6473        let mut pgd = ProjectedGradientDescent::new(1000, 0.1, 1e-6);
6474        let x0 = Vector::zeros(4);
6475        let result = pgd.minimize(objective, gradient, project, x0);
6476
6477        assert_eq!(result.status, ConvergenceStatus::Converged);
6478        assert!((result.solution[0] - 1.0).abs() < 1e-4); // max(1.0, 0) = 1.0
6479        assert!(result.solution[1].abs() < 1e-4); // max(-2.0, 0) = 0.0
6480        assert!((result.solution[2] - 3.0).abs() < 1e-4); // max(3.0, 0) = 3.0
6481        assert!(result.solution[3].abs() < 1e-4); // max(-1.0, 0) = 0.0
6482    }
6483
6484    #[test]
6485    fn test_projected_gd_box_constraints() {
6486        // Minimize: ½‖x - c‖² subject to 0 ≤ x ≤ 2
6487        let c = Vector::from_slice(&[1.5, -1.0, 3.0, 0.5]);
6488        let lower = Vector::zeros(4);
6489        let upper = Vector::from_slice(&[2.0, 2.0, 2.0, 2.0]);
6490
6491        let objective = |x: &Vector<f32>| {
6492            let mut obj = 0.0;
6493            for i in 0..x.len() {
6494                let diff = x[i] - c[i];
6495                obj += 0.5 * diff * diff;
6496            }
6497            obj
6498        };
6499
6500        let gradient = |x: &Vector<f32>| {
6501            let mut grad = Vector::zeros(x.len());
6502            for i in 0..x.len() {
6503                grad[i] = x[i] - c[i];
6504            }
6505            grad
6506        };
6507
6508        let lower_clone = lower.clone();
6509        let upper_clone = upper.clone();
6510        let project = move |x: &Vector<f32>| prox::project_box(x, &lower_clone, &upper_clone);
6511
6512        let mut pgd = ProjectedGradientDescent::new(1000, 0.1, 1e-6);
6513        let x0 = Vector::ones(4);
6514        let result = pgd.minimize(objective, gradient, project, x0);
6515
6516        assert_eq!(result.status, ConvergenceStatus::Converged);
6517        assert!((result.solution[0] - 1.5).abs() < 1e-4); // clamp(1.5, 0, 2) = 1.5
6518        assert!(result.solution[1].abs() < 1e-4); // clamp(-1.0, 0, 2) = 0.0
6519        assert!((result.solution[2] - 2.0).abs() < 1e-4); // clamp(3.0, 0, 2) = 2.0
6520        assert!((result.solution[3] - 0.5).abs() < 1e-4); // clamp(0.5, 0, 2) = 0.5
6521    }
6522
6523    #[test]
6524    fn test_projected_gd_l2_ball() {
6525        // Minimize: ½‖x - c‖² subject to ‖x‖₂ ≤ 1
6526        let c = Vector::from_slice(&[2.0, 2.0]);
6527        let radius = 1.0;
6528
6529        let objective = |x: &Vector<f32>| {
6530            let mut obj = 0.0;
6531            for i in 0..x.len() {
6532                let diff = x[i] - c[i];
6533                obj += 0.5 * diff * diff;
6534            }
6535            obj
6536        };
6537
6538        let gradient = |x: &Vector<f32>| {
6539            let mut grad = Vector::zeros(x.len());
6540            for i in 0..x.len() {
6541                grad[i] = x[i] - c[i];
6542            }
6543            grad
6544        };
6545
6546        let project = move |x: &Vector<f32>| prox::project_l2_ball(x, radius);
6547
6548        let mut pgd = ProjectedGradientDescent::new(1000, 0.1, 1e-6);
6549        let x0 = Vector::zeros(2);
6550        let result = pgd.minimize(objective, gradient, project, x0);
6551
6552        assert_eq!(result.status, ConvergenceStatus::Converged);
6553
6554        // Solution should be c/‖c‖₂ * radius = [2,2]/√8 = [√2/2, √2/2]
6555        let norm = (result.solution[0] * result.solution[0]
6556            + result.solution[1] * result.solution[1])
6557            .sqrt();
6558        assert!((norm - radius).abs() < 1e-4); // On boundary
6559        assert!((result.solution[0] - std::f32::consts::FRAC_1_SQRT_2).abs() < 1e-3); // √2/2
6560        assert!((result.solution[1] - std::f32::consts::FRAC_1_SQRT_2).abs() < 1e-3);
6561    }
6562
6563    #[test]
6564    fn test_projected_gd_with_line_search() {
6565        // Same problem as nonnegative, but with line search
6566        let c = Vector::from_slice(&[1.0, -2.0, 3.0]);
6567
6568        let objective = |x: &Vector<f32>| {
6569            let mut obj = 0.0;
6570            for i in 0..x.len() {
6571                let diff = x[i] - c[i];
6572                obj += 0.5 * diff * diff;
6573            }
6574            obj
6575        };
6576
6577        let gradient = |x: &Vector<f32>| {
6578            let mut grad = Vector::zeros(x.len());
6579            for i in 0..x.len() {
6580                grad[i] = x[i] - c[i];
6581            }
6582            grad
6583        };
6584
6585        let project = |x: &Vector<f32>| prox::nonnegative(x);
6586
6587        let mut pgd = ProjectedGradientDescent::new(1000, 1.0, 1e-6).with_line_search(0.5);
6588        let x0 = Vector::zeros(3);
6589        let result = pgd.minimize(objective, gradient, project, x0);
6590
6591        assert_eq!(result.status, ConvergenceStatus::Converged);
6592        assert!((result.solution[0] - 1.0).abs() < 1e-4);
6593        assert!(result.solution[1].abs() < 1e-4);
6594        assert!((result.solution[2] - 3.0).abs() < 1e-4);
6595    }
6596
6597    #[test]
6598    fn test_projected_gd_quadratic() {
6599        // Minimize: ½xᵀQx - bᵀx subject to x ≥ 0
6600        // Q = [[2, 0], [0, 2]] (identity scaled by 2)
6601        // b = [4, -2]
6602        // Unconstrained solution: x = Q⁻¹b = [2, -1]
6603        // Constrained solution: x = [2, 0]
6604
6605        let objective = |x: &Vector<f32>| {
6606            0.5 * (2.0 * x[0] * x[0] + 2.0 * x[1] * x[1]) - (4.0 * x[0] - 2.0 * x[1])
6607        };
6608
6609        let gradient = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0] - 4.0, 2.0 * x[1] + 2.0]);
6610
6611        let project = |x: &Vector<f32>| prox::nonnegative(x);
6612
6613        let mut pgd = ProjectedGradientDescent::new(1000, 0.1, 1e-6);
6614        let x0 = Vector::zeros(2);
6615        let result = pgd.minimize(objective, gradient, project, x0);
6616
6617        assert_eq!(result.status, ConvergenceStatus::Converged);
6618        assert!((result.solution[0] - 2.0).abs() < 1e-3);
6619        assert!(result.solution[1].abs() < 1e-3);
6620    }
6621
6622    #[test]
6623    fn test_projected_gd_convergence_tracking() {
6624        let c = Vector::from_slice(&[1.0, 2.0]);
6625
6626        let objective = |x: &Vector<f32>| {
6627            let mut obj = 0.0;
6628            for i in 0..x.len() {
6629                let diff = x[i] - c[i];
6630                obj += 0.5 * diff * diff;
6631            }
6632            obj
6633        };
6634
6635        let gradient = |x: &Vector<f32>| {
6636            let mut grad = Vector::zeros(x.len());
6637            for i in 0..x.len() {
6638                grad[i] = x[i] - c[i];
6639            }
6640            grad
6641        };
6642
6643        let project = |x: &Vector<f32>| prox::nonnegative(x);
6644
6645        let mut pgd = ProjectedGradientDescent::new(1000, 0.1, 1e-6);
6646        let x0 = Vector::zeros(2);
6647        let result = pgd.minimize(objective, gradient, project, x0);
6648
6649        assert_eq!(result.status, ConvergenceStatus::Converged);
6650        assert!(result.iterations > 0);
6651        assert!(result.elapsed_time.as_nanos() > 0);
6652        assert!(result.gradient_norm < 1.0); // Should have small gradient at solution
6653    }
6654
6655    #[test]
6656    fn test_projected_gd_max_iterations() {
6657        // Use very tight tolerance to force max iterations
6658        let c = Vector::from_slice(&[1.0, 2.0]);
6659
6660        let objective = |x: &Vector<f32>| {
6661            let mut obj = 0.0;
6662            for i in 0..x.len() {
6663                let diff = x[i] - c[i];
6664                obj += 0.5 * diff * diff;
6665            }
6666            obj
6667        };
6668
6669        let gradient = |x: &Vector<f32>| {
6670            let mut grad = Vector::zeros(x.len());
6671            for i in 0..x.len() {
6672                grad[i] = x[i] - c[i];
6673            }
6674            grad
6675        };
6676
6677        let project = |x: &Vector<f32>| prox::nonnegative(x);
6678
6679        let mut pgd = ProjectedGradientDescent::new(3, 0.01, 1e-12); // Very few iterations
6680        let x0 = Vector::zeros(2);
6681        let result = pgd.minimize(objective, gradient, project, x0);
6682
6683        assert_eq!(result.status, ConvergenceStatus::MaxIterations);
6684        assert_eq!(result.iterations, 3);
6685    }
6686
6687    #[test]
6688    fn test_projected_gd_unconstrained_equivalent() {
6689        // When projection is identity, should behave like gradient descent
6690        let c = Vector::from_slice(&[1.0, 2.0]);
6691
6692        let objective = |x: &Vector<f32>| {
6693            let mut obj = 0.0;
6694            for i in 0..x.len() {
6695                let diff = x[i] - c[i];
6696                obj += 0.5 * diff * diff;
6697            }
6698            obj
6699        };
6700
6701        let gradient = |x: &Vector<f32>| {
6702            let mut grad = Vector::zeros(x.len());
6703            for i in 0..x.len() {
6704                grad[i] = x[i] - c[i];
6705            }
6706            grad
6707        };
6708
6709        let project = |x: &Vector<f32>| x.clone(); // Identity projection
6710
6711        let mut pgd = ProjectedGradientDescent::new(1000, 0.1, 1e-6);
6712        let x0 = Vector::zeros(2);
6713        let result = pgd.minimize(objective, gradient, project, x0);
6714
6715        assert_eq!(result.status, ConvergenceStatus::Converged);
6716        assert!((result.solution[0] - 1.0).abs() < 1e-4);
6717        assert!((result.solution[1] - 2.0).abs() < 1e-4);
6718    }
6719
6720    // ==================== Augmented Lagrangian Tests ====================
6721
6722    #[test]
6723    fn test_augmented_lagrangian_linear_equality() {
6724        // Minimize: ½(x₁-2)² + ½(x₂-3)² subject to x₁ + x₂ = 1
6725        // Analytical solution: x = [2, 3] - λ[1, 1] where x₁+x₂=1
6726        // Solving: 2-λ + 3-λ = 1 → λ = 2, so x = [0, 1]
6727
6728        let objective = |x: &Vector<f32>| 0.5 * (x[0] - 2.0).powi(2) + 0.5 * (x[1] - 3.0).powi(2);
6729
6730        let gradient = |x: &Vector<f32>| Vector::from_slice(&[x[0] - 2.0, x[1] - 3.0]);
6731
6732        let equality = |x: &Vector<f32>| Vector::from_slice(&[x[0] + x[1] - 1.0]);
6733
6734        let equality_jac = |_x: &Vector<f32>| vec![Vector::from_slice(&[1.0, 1.0])];
6735
6736        let mut al = AugmentedLagrangian::new(100, 1e-4, 1.0);
6737        let x0 = Vector::zeros(2);
6738        let result = al.minimize_equality(objective, gradient, equality, equality_jac, x0);
6739
6740        // Check constraint satisfaction
6741        assert!(result.constraint_violation < 1e-3);
6742        // Check that x₁ + x₂ ≈ 1
6743        assert!((result.solution[0] + result.solution[1] - 1.0).abs() < 1e-3);
6744    }
6745
6746    #[test]
6747    fn test_augmented_lagrangian_multiple_constraints() {
6748        // Minimize: ½‖x‖² subject to x₁ + x₂ = 1, x₁ - x₂ = 0
6749        // This means x₁ = x₂ and x₁ + x₂ = 1, so x = [0.5, 0.5]
6750
6751        let objective = |x: &Vector<f32>| 0.5 * (x[0] * x[0] + x[1] * x[1]);
6752
6753        let gradient = |x: &Vector<f32>| Vector::from_slice(&[x[0], x[1]]);
6754
6755        let equality = |x: &Vector<f32>| Vector::from_slice(&[x[0] + x[1] - 1.0, x[0] - x[1]]);
6756
6757        let equality_jac = |_x: &Vector<f32>| {
6758            vec![
6759                Vector::from_slice(&[1.0, 1.0]),
6760                Vector::from_slice(&[1.0, -1.0]),
6761            ]
6762        };
6763
6764        let mut al = AugmentedLagrangian::new(200, 1e-4, 1.0);
6765        let x0 = Vector::zeros(2);
6766        let result = al.minimize_equality(objective, gradient, equality, equality_jac, x0);
6767
6768        assert!(result.constraint_violation < 1e-3);
6769        assert!((result.solution[0] - 0.5).abs() < 1e-2);
6770        assert!((result.solution[1] - 0.5).abs() < 1e-2);
6771    }
6772
6773    #[test]
6774    fn test_augmented_lagrangian_3d() {
6775        // Minimize: ½‖x - c‖² subject to x₁ + x₂ + x₃ = 1
6776        let c = Vector::from_slice(&[1.0, 2.0, 3.0]);
6777
6778        let objective = |x: &Vector<f32>| {
6779            0.5 * ((x[0] - c[0]).powi(2) + (x[1] - c[1]).powi(2) + (x[2] - c[2]).powi(2))
6780        };
6781
6782        let gradient =
6783            |x: &Vector<f32>| Vector::from_slice(&[x[0] - c[0], x[1] - c[1], x[2] - c[2]]);
6784
6785        let equality = |x: &Vector<f32>| Vector::from_slice(&[x[0] + x[1] + x[2] - 1.0]);
6786
6787        let equality_jac = |_x: &Vector<f32>| vec![Vector::from_slice(&[1.0, 1.0, 1.0])];
6788
6789        let mut al = AugmentedLagrangian::new(100, 1e-4, 1.0);
6790        let x0 = Vector::zeros(3);
6791        let result = al.minimize_equality(objective, gradient, equality, equality_jac, x0);
6792
6793        assert!(result.constraint_violation < 1e-3);
6794        assert!((result.solution[0] + result.solution[1] + result.solution[2] - 1.0).abs() < 1e-3);
6795    }
6796
6797    #[test]
6798    fn test_augmented_lagrangian_quadratic_with_constraint() {
6799        // Minimize: x₁² + 2x₂² subject to 2x₁ + x₂ = 1
6800        // Lagrangian: L = x₁² + 2x₂² - λ(2x₁ + x₂ - 1)
6801        // KKT: 2x₁ - 2λ = 0, 4x₂ - λ = 0, 2x₁ + x₂ = 1
6802        // Solution: x₁ = λ, x₂ = λ/4, 2λ + λ/4 = 1 → λ = 4/9
6803        // So x = [4/9, 1/9]
6804
6805        let objective = |x: &Vector<f32>| x[0] * x[0] + 2.0 * x[1] * x[1];
6806
6807        let gradient = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 4.0 * x[1]]);
6808
6809        let equality = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0] + x[1] - 1.0]);
6810
6811        let equality_jac = |_x: &Vector<f32>| vec![Vector::from_slice(&[2.0, 1.0])];
6812
6813        let mut al = AugmentedLagrangian::new(150, 1e-4, 1.0);
6814        let x0 = Vector::zeros(2);
6815        let result = al.minimize_equality(objective, gradient, equality, equality_jac, x0);
6816
6817        assert!(result.constraint_violation < 1e-3);
6818        assert!((result.solution[0] - 4.0 / 9.0).abs() < 1e-2);
6819        assert!((result.solution[1] - 1.0 / 9.0).abs() < 1e-2);
6820    }
6821
6822    #[test]
6823    fn test_augmented_lagrangian_convergence_tracking() {
6824        let objective = |x: &Vector<f32>| 0.5 * (x[0] * x[0] + x[1] * x[1]);
6825
6826        let gradient = |x: &Vector<f32>| Vector::from_slice(&[x[0], x[1]]);
6827
6828        let equality = |x: &Vector<f32>| Vector::from_slice(&[x[0] + x[1] - 1.0]);
6829
6830        let equality_jac = |_x: &Vector<f32>| vec![Vector::from_slice(&[1.0, 1.0])];
6831
6832        let mut al = AugmentedLagrangian::new(100, 1e-4, 1.0);
6833        let x0 = Vector::zeros(2);
6834        let result = al.minimize_equality(objective, gradient, equality, equality_jac, x0);
6835
6836        assert_eq!(result.status, ConvergenceStatus::Converged);
6837        assert!(result.iterations > 0);
6838        assert!(result.elapsed_time.as_nanos() > 0);
6839        assert!(result.constraint_violation < 1e-3);
6840    }
6841
6842    #[test]
6843    fn test_augmented_lagrangian_rho_adaptation() {
6844        // Test with custom rho increase factor
6845        let objective = |x: &Vector<f32>| 0.5 * (x[0] * x[0] + x[1] * x[1]);
6846
6847        let gradient = |x: &Vector<f32>| Vector::from_slice(&[x[0], x[1]]);
6848
6849        let equality = |x: &Vector<f32>| Vector::from_slice(&[x[0] + x[1] - 1.0]);
6850
6851        let equality_jac = |_x: &Vector<f32>| vec![Vector::from_slice(&[1.0, 1.0])];
6852
6853        let mut al = AugmentedLagrangian::new(200, 1e-4, 1.0).with_rho_increase(3.0);
6854        let x0 = Vector::zeros(2);
6855        let result = al.minimize_equality(objective, gradient, equality, equality_jac, x0);
6856
6857        assert!(result.constraint_violation < 1e-2); // Relaxed tolerance for high rho_increase
6858    }
6859
6860    #[test]
6861    fn test_augmented_lagrangian_max_iterations() {
6862        // Use very few iterations to force max iterations status
6863        let objective = |x: &Vector<f32>| 0.5 * (x[0] * x[0] + x[1] * x[1]);
6864
6865        let gradient = |x: &Vector<f32>| Vector::from_slice(&[x[0], x[1]]);
6866
6867        let equality = |x: &Vector<f32>| Vector::from_slice(&[x[0] + x[1] - 1.0]);
6868
6869        let equality_jac = |_x: &Vector<f32>| vec![Vector::from_slice(&[1.0, 1.0])];
6870
6871        let mut al = AugmentedLagrangian::new(2, 1e-10, 1.0); // Very few iterations
6872        let x0 = Vector::zeros(2);
6873        let result = al.minimize_equality(objective, gradient, equality, equality_jac, x0);
6874
6875        assert_eq!(result.status, ConvergenceStatus::MaxIterations);
6876        assert_eq!(result.iterations, 2);
6877    }
6878
6879    // ==================== Interior Point Tests ====================
6880
6881    #[test]
6882    fn test_interior_point_nonnegative() {
6883        // Minimize: x₁² + x₂² subject to -x₁ ≤ 0, -x₂ ≤ 0 (i.e., x ≥ 0)
6884        // Solution: x = [0, 0]
6885
6886        let objective = |x: &Vector<f32>| x[0] * x[0] + x[1] * x[1];
6887
6888        let gradient = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 2.0 * x[1]]);
6889
6890        // Inequality constraints: g(x) = [-x₁, -x₂] ≤ 0
6891        let inequality = |x: &Vector<f32>| Vector::from_slice(&[-x[0], -x[1]]);
6892
6893        let inequality_jac = |_x: &Vector<f32>| {
6894            vec![
6895                Vector::from_slice(&[-1.0, 0.0]),
6896                Vector::from_slice(&[0.0, -1.0]),
6897            ]
6898        };
6899
6900        let mut ip = InteriorPoint::new(80, 1e-5, 1.0);
6901        let x0 = Vector::from_slice(&[0.5, 0.5]); // Interior feasible start
6902        let result = ip.minimize(objective, gradient, inequality, inequality_jac, x0);
6903
6904        // Solution approaches [0, 0] as barrier parameter decreases
6905        assert!(result.solution[0].abs() < 0.2);
6906        assert!(result.solution[1].abs() < 0.2);
6907        assert!(result.constraint_violation <= 0.0); // All constraints satisfied
6908    }
6909
6910    #[test]
6911    fn test_interior_point_box_constraints() {
6912        // Minimize: (x₁-0.8)² + (x₂-0.8)² subject to 0 ≤ x ≤ 1
6913        // Target is inside the box, so solution should approach [0.8, 0.8]
6914
6915        let objective = |x: &Vector<f32>| (x[0] - 0.8).powi(2) + (x[1] - 0.8).powi(2);
6916
6917        let gradient =
6918            |x: &Vector<f32>| Vector::from_slice(&[2.0 * (x[0] - 0.8), 2.0 * (x[1] - 0.8)]);
6919
6920        // g(x) = [-x₁, -x₂, x₁-1, x₂-1] ≤ 0
6921        let inequality =
6922            |x: &Vector<f32>| Vector::from_slice(&[-x[0], -x[1], x[0] - 1.0, x[1] - 1.0]);
6923
6924        let inequality_jac = |_x: &Vector<f32>| {
6925            vec![
6926                Vector::from_slice(&[-1.0, 0.0]),
6927                Vector::from_slice(&[0.0, -1.0]),
6928                Vector::from_slice(&[1.0, 0.0]),
6929                Vector::from_slice(&[0.0, 1.0]),
6930            ]
6931        };
6932
6933        let mut ip = InteriorPoint::new(80, 1e-4, 1.0);
6934        let x0 = Vector::from_slice(&[0.5, 0.5]); // Feasible start (interior)
6935        let result = ip.minimize(objective, gradient, inequality, inequality_jac, x0);
6936
6937        // Solution should be within box [0,1]×[0,1]
6938        assert!(result.solution[0] >= 0.0 && result.solution[0] <= 1.0);
6939        assert!(result.solution[1] >= 0.0 && result.solution[1] <= 1.0);
6940        assert!(result.constraint_violation <= 0.0);
6941    }
6942
6943    #[test]
6944    fn test_interior_point_linear_constraint() {
6945        // Minimize: x₁² + x₂² subject to x₁ + x₂ ≤ 2
6946        // Solution is interior or on boundary
6947
6948        let objective = |x: &Vector<f32>| x[0] * x[0] + x[1] * x[1];
6949
6950        let gradient = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 2.0 * x[1]]);
6951
6952        // g(x) = [x₁ + x₂ - 2] ≤ 0
6953        let inequality = |x: &Vector<f32>| Vector::from_slice(&[x[0] + x[1] - 2.0]);
6954
6955        let inequality_jac = |_x: &Vector<f32>| vec![Vector::from_slice(&[1.0, 1.0])];
6956
6957        let mut ip = InteriorPoint::new(80, 1e-5, 1.0);
6958        let x0 = Vector::from_slice(&[0.5, 0.5]); // Feasible start
6959        let result = ip.minimize(objective, gradient, inequality, inequality_jac, x0);
6960
6961        // Solution approaches [1, 1] on boundary or stays interior
6962        assert!(result.solution[0] + result.solution[1] <= 2.1);
6963        assert!(result.constraint_violation <= 0.0);
6964    }
6965
6966    #[test]
6967    fn test_interior_point_3d() {
6968        // Minimize: ‖x‖² subject to x₁ + x₂ + x₃ ≤ 1, x ≥ 0
6969
6970        let objective = |x: &Vector<f32>| x[0] * x[0] + x[1] * x[1] + x[2] * x[2];
6971
6972        let gradient = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 2.0 * x[1], 2.0 * x[2]]);
6973
6974        // g(x) = [x₁+x₂+x₃-1, -x₁, -x₂, -x₃] ≤ 0
6975        let inequality =
6976            |x: &Vector<f32>| Vector::from_slice(&[x[0] + x[1] + x[2] - 1.0, -x[0], -x[1], -x[2]]);
6977
6978        let inequality_jac = |_x: &Vector<f32>| {
6979            vec![
6980                Vector::from_slice(&[1.0, 1.0, 1.0]),
6981                Vector::from_slice(&[-1.0, 0.0, 0.0]),
6982                Vector::from_slice(&[0.0, -1.0, 0.0]),
6983                Vector::from_slice(&[0.0, 0.0, -1.0]),
6984            ]
6985        };
6986
6987        let mut ip = InteriorPoint::new(80, 1e-5, 1.0);
6988        let x0 = Vector::from_slice(&[0.2, 0.2, 0.2]); // Feasible start
6989        let result = ip.minimize(objective, gradient, inequality, inequality_jac, x0);
6990
6991        // Solution moves toward origin while satisfying constraints
6992        assert!(result.solution[0] + result.solution[1] + result.solution[2] <= 1.1);
6993        assert!(result.solution[0] >= -0.1);
6994        assert!(result.solution[1] >= -0.1);
6995        assert!(result.solution[2] >= -0.1);
6996    }
6997
6998    #[test]
6999    fn test_interior_point_convergence_tracking() {
7000        let objective = |x: &Vector<f32>| x[0] * x[0] + x[1] * x[1];
7001
7002        let gradient = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 2.0 * x[1]]);
7003
7004        let inequality = |x: &Vector<f32>| Vector::from_slice(&[-x[0], -x[1]]);
7005
7006        let inequality_jac = |_x: &Vector<f32>| {
7007            vec![
7008                Vector::from_slice(&[-1.0, 0.0]),
7009                Vector::from_slice(&[0.0, -1.0]),
7010            ]
7011        };
7012
7013        let mut ip = InteriorPoint::new(50, 1e-6, 1.0);
7014        let x0 = Vector::from_slice(&[1.0, 1.0]);
7015        let result = ip.minimize(objective, gradient, inequality, inequality_jac, x0);
7016
7017        assert!(result.iterations > 0);
7018        assert!(result.elapsed_time.as_nanos() > 0);
7019        assert!(result.constraint_violation <= 0.0);
7020    }
7021
7022    #[test]
7023    fn test_interior_point_beta_parameter() {
7024        // Test with custom beta (barrier decrease factor)
7025        let objective = |x: &Vector<f32>| x[0] * x[0] + x[1] * x[1];
7026
7027        let gradient = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 2.0 * x[1]]);
7028
7029        let inequality = |x: &Vector<f32>| Vector::from_slice(&[-x[0], -x[1]]);
7030
7031        let inequality_jac = |_x: &Vector<f32>| {
7032            vec![
7033                Vector::from_slice(&[-1.0, 0.0]),
7034                Vector::from_slice(&[0.0, -1.0]),
7035            ]
7036        };
7037
7038        let mut ip = InteriorPoint::new(50, 1e-6, 1.0).with_beta(0.1);
7039        let x0 = Vector::from_slice(&[1.0, 1.0]);
7040        let result = ip.minimize(objective, gradient, inequality, inequality_jac, x0);
7041
7042        assert!(result.solution[0].abs() < 1e-1);
7043        assert!(result.solution[1].abs() < 1e-1);
7044    }
7045
7046    #[test]
7047    #[should_panic(expected = "Initial point is infeasible")]
7048    fn test_interior_point_infeasible_start() {
7049        // Test that infeasible initial point panics
7050        let objective = |x: &Vector<f32>| x[0] * x[0] + x[1] * x[1];
7051
7052        let gradient = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 2.0 * x[1]]);
7053
7054        let inequality = |x: &Vector<f32>| Vector::from_slice(&[-x[0], -x[1]]);
7055
7056        let inequality_jac = |_x: &Vector<f32>| {
7057            vec![
7058                Vector::from_slice(&[-1.0, 0.0]),
7059                Vector::from_slice(&[0.0, -1.0]),
7060            ]
7061        };
7062
7063        let mut ip = InteriorPoint::new(50, 1e-6, 1.0);
7064        let x0 = Vector::from_slice(&[-1.0, 1.0]); // INFEASIBLE! x₁ < 0
7065        ip.minimize(objective, gradient, inequality, inequality_jac, x0);
7066    }
7067
7068    #[test]
7069    fn test_interior_point_max_iterations() {
7070        let objective = |x: &Vector<f32>| x[0] * x[0] + x[1] * x[1];
7071
7072        let gradient = |x: &Vector<f32>| Vector::from_slice(&[2.0 * x[0], 2.0 * x[1]]);
7073
7074        let inequality = |x: &Vector<f32>| Vector::from_slice(&[-x[0], -x[1]]);
7075
7076        let inequality_jac = |_x: &Vector<f32>| {
7077            vec![
7078                Vector::from_slice(&[-1.0, 0.0]),
7079                Vector::from_slice(&[0.0, -1.0]),
7080            ]
7081        };
7082
7083        let mut ip = InteriorPoint::new(2, 1e-10, 1.0); // Very few iterations
7084        let x0 = Vector::from_slice(&[1.0, 1.0]);
7085        let result = ip.minimize(objective, gradient, inequality, inequality_jac, x0);
7086
7087        assert_eq!(result.status, ConvergenceStatus::MaxIterations);
7088        assert_eq!(result.iterations, 2);
7089    }
7090}