rustorch 0.6.29

//! L-BFGS (Limited-memory Broyden-Fletcher-Goldfarb-Shanno) optimizer
//! L-BFGS（制限メモリ Broyden-Fletcher-Goldfarb-Shanno）オプティマイザー
//!
//! L-BFGS is a second-order optimization method that approximates the inverse
//! Hessian matrix using limited memory, providing superior convergence for
//! well-behaved optimization problems.
//!
//! L-BFGSは制限メモリを使用して逆ヘッシアン行列を近似する二次最適化手法で、
//! 適切な最適化問題に対して優れた収束性を提供します。

use super::Optimizer;
use crate::error::{RusTorchError, RusTorchResult};
use crate::tensor::Tensor;
use std::collections::{HashMap, VecDeque};

/// L-BFGS optimizer with limited memory storage
/// 制限メモリストレージ付きL-BFGSオプティマイザー
#[derive(Debug, Clone)]
pub struct LBFGS {
    learning_rate: f32,
    max_iter: usize,
    max_eval: usize,
    tolerance_grad: f32,
    tolerance_change: f32,
    line_search_fn: LineSearchMethod,
    step_count: usize,
    // Enhanced modular L-BFGS memory storage per parameter
    memory: HashMap<usize, LBFGSMemory>,
    prev_grad: HashMap<usize, Tensor<f32>>,
}

/// Line search methods for L-BFGS
/// L-BFGS用線形探索手法
#[derive(Debug, Clone)]
pub enum LineSearchMethod {
    /// Strong Wolfe line search (recommended)
    /// 強ウォルフ線形探索（推奨）
    StrongWolfe { c1: f32, c2: f32 },
    /// Backtracking line search (simpler)
    /// バックトラッキング線形探索（より単純）
    Backtracking { c1: f32, rho: f32 },
    /// None (use fixed step size)
    /// なし（固定ステップサイズを使用）
    None,
}

impl Default for LineSearchMethod {
    fn default() -> Self {
        LineSearchMethod::StrongWolfe { c1: 1e-4, c2: 0.9 }
    }
}

/// L-BFGS memory storage for parameter history
/// パラメータ履歴のためのL-BFGSメモリストレージ
#[derive(Debug, Clone)]
pub struct LBFGSMemory {
    s_history: VecDeque<Tensor<f32>>, // Parameter changes
    y_history: VecDeque<Tensor<f32>>, // Gradient changes
    rho_history: VecDeque<f32>,       // 1 / (y^T * s)
    max_size: usize,
}

impl LBFGSMemory {
    /// Create new L-BFGS memory storage
    pub fn new(max_size: usize) -> Self {
        Self {
            s_history: VecDeque::new(),
            y_history: VecDeque::new(),
            rho_history: VecDeque::new(),
            max_size: max_size.max(1),
        }
    }

    /// Add new curvature pair to memory
    pub fn update(&mut self, s: Tensor<f32>, y: Tensor<f32>) -> RusTorchResult<()> {
        // Compute ρ = 1 / (y^T * s) with enhanced numerical stability
        let s_y_product = &s * &y;
        let y_dot_s = s_y_product.sum();

        // Skip update if curvature condition is not satisfied (enhanced check)
        if y_dot_s.abs() < 1e-12 {
            return Err(RusTorchError::InvalidParameters {
                operation: "L-BFGS memory update".to_string(),
                message: "Insufficient curvature condition: y^T * s too small".to_string(),
            });
        }

        let rho = 1.0 / y_dot_s;

        // Add new information
        self.s_history.push_back(s);
        self.y_history.push_back(y);
        self.rho_history.push_back(rho);

        // Maintain history size limit
        while self.s_history.len() > self.max_size {
            self.s_history.pop_front();
            self.y_history.pop_front();
            self.rho_history.pop_front();
        }

        Ok(())
    }

    /// Get current memory size
    pub fn size(&self) -> usize {
        self.s_history.len()
    }

    /// Check if memory is empty
    pub fn is_empty(&self) -> bool {
        self.s_history.is_empty()
    }

    /// Clear all memory
    pub fn clear(&mut self) {
        self.s_history.clear();
        self.y_history.clear();
        self.rho_history.clear();
    }

    /// Compute search direction using two-loop recursion
    pub fn compute_search_direction(&self, grad: &Tensor<f32>) -> Tensor<f32> {
        if self.is_empty() {
            return grad.clone() * (-1.0);
        }

        let mut q = grad.clone();
        let mut alphas = Vec::with_capacity(self.size());

        // First loop: backward pass (more stable iteration)
        for i in (0..self.s_history.len()).rev() {
            let rho = self.rho_history[i];
            let s = &self.s_history[i];
            let y = &self.y_history[i];

            // α_i = ρ_i * s_i^T * q (numerically stable dot product)
            let s_q_product = s * &q;
            let alpha = rho * s_q_product.sum();
            alphas.push(alpha);

            // q = q - α_i * y_i
            let y_term = y * alpha;
            q = &q - &y_term;
        }

        // Apply initial Hessian approximation (H_0 = γI) with better stability
        let gamma =
            if let (Some(s_last), Some(y_last)) = (self.s_history.back(), self.y_history.back()) {
                let s_y_product = s_last * y_last;
                let s_dot_y = s_y_product.sum();
                let y_squared = y_last * y_last;
                let y_dot_y = y_squared.sum();

                if y_dot_y > 1e-12 {
                    (s_dot_y / y_dot_y).clamp(1e-8, 1e8) // Clamp to reasonable range
                } else {
                    1.0
                }
            } else {
                1.0
            };

        let mut r = &q * gamma;
        alphas.reverse();

        // Second loop: forward pass with improved numerical stability
        for (i, &alpha) in alphas.iter().enumerate() {
            if i < self.s_history.len() {
                let rho = self.rho_history[i];
                let s = &self.s_history[i];
                let y = &self.y_history[i];

                // β = ρ_i * y_i^T * r
                let y_r_product = y * &r;
                let beta = rho * y_r_product.sum();

                // r = r + s_i * (α_i - β)
                let s_term = s * (alpha - beta);
                r = &r + &s_term;
            }
        }

        // Return negative search direction (for minimization)
        r * (-1.0)
    }
}

impl LBFGS {
    /// Create new L-BFGS optimizer with default parameters
    /// デフォルトパラメータで新しいL-BFGSオプティマイザーを作成
    pub fn new(learning_rate: f32) -> RusTorchResult<Self> {
        Self::with_params(
            learning_rate,
            20,   // max_iter
            20,   // max_eval
            1e-5, // tolerance_grad
            1e-9, // tolerance_change
            10,   // history_size
            LineSearchMethod::default(),
        )
    }

    /// Create L-BFGS optimizer with custom parameters
    /// カスタムパラメータでL-BFGSオプティマイザーを作成
    pub fn with_params(
        learning_rate: f32,
        max_iter: usize,
        max_eval: usize,
        tolerance_grad: f32,
        tolerance_change: f32,
        history_size: usize,
        line_search_fn: LineSearchMethod,
    ) -> RusTorchResult<Self> {
        if learning_rate <= 0.0 {
            return Err(RusTorchError::InvalidParameters {
                operation: "L-BFGS optimizer".to_string(),
                message: "Learning rate must be positive".to_string(),
            });
        }

        if tolerance_grad < 0.0 {
            return Err(RusTorchError::InvalidParameters {
                operation: "L-BFGS optimizer".to_string(),
                message: "Gradient tolerance must be non-negative".to_string(),
            });
        }

        if history_size == 0 {
            return Err(RusTorchError::InvalidParameters {
                operation: "L-BFGS optimizer".to_string(),
                message: "History size must be positive".to_string(),
            });
        }

        Ok(Self {
            learning_rate,
            max_iter,
            max_eval,
            tolerance_grad,
            tolerance_change,
            line_search_fn,
            step_count: 0,
            memory: HashMap::new(),
            prev_grad: HashMap::new(),
        })
    }

    /// Set maximum number of iterations
    /// 最大反復回数を設定
    pub fn set_max_iter(&mut self, max_iter: usize) {
        self.max_iter = max_iter;
    }

    /// Set gradient tolerance for convergence
    /// 収束用勾配許容誤差を設定
    pub fn set_tolerance_grad(&mut self, tolerance: f32) -> RusTorchResult<()> {
        if tolerance < 0.0 {
            return Err(RusTorchError::InvalidParameters {
                operation: "L-BFGS optimizer".to_string(),
                message: "Gradient tolerance must be non-negative".to_string(),
            });
        }
        self.tolerance_grad = tolerance;
        Ok(())
    }

    /// Reset L-BFGS memory for specific parameter or all parameters
    /// 特定パラメータまたは全パラメータのL-BFGSメモリをリセット
    pub fn reset_memory(&mut self, param_id: Option<usize>) {
        match param_id {
            Some(id) => {
                if let Some(memory) = self.memory.get_mut(&id) {
                    memory.clear();
                }
                self.prev_grad.remove(&id);
            }
            None => {
                self.memory.clear();
                self.prev_grad.clear();
                self.step_count = 0;
            }
        }
    }

    /// Compute L-BFGS search direction using modular memory
    /// モジュラーメモリを使用してL-BFGS探索方向を計算
    fn compute_search_direction(&self, param_id: usize, grad: &Tensor<f32>) -> Tensor<f32> {
        if let Some(memory) = self.memory.get(&param_id) {
            memory.compute_search_direction(grad)
        } else {
            // First iteration: use steepest descent
            grad.clone() * (-1.0)
        }
    }

    /// Enhanced backtracking line search with Armijo condition
    /// Armijo条件付き強化バックトラッキング線形探索
    fn backtracking_line_search(
        &self,
        param: &Tensor<f32>,
        grad: &Tensor<f32>,
        direction: &Tensor<f32>,
        c1: f32,
        rho: f32,
    ) -> f32 {
        let max_iterations = 25; // Increased iterations for better convergence
        let mut alpha = self.learning_rate;

        // Compute initial directional derivative: g^T * p with numerical stability
        let grad_dir_product = grad * direction;
        let directional_derivative = grad_dir_product.sum();

        // Enhanced descent direction check
        if directional_derivative >= -1e-12 {
            return 1e-6; // Not a descent direction
        }

        // Compute initial objective approximation (grad norm squared)
        let grad_squared = grad * grad;
        let f0 = grad_squared.sum();

        for iteration in 0..max_iterations {
            // Enhanced Armijo condition evaluation
            let reduction_estimate = alpha * c1 * directional_derivative;
            let expected_improvement = reduction_estimate.abs();

            // More sophisticated acceptance criteria
            if alpha > 1e-10 && expected_improvement > 1e-10 {
                // Additional curvature check could be added here
                return alpha;
            }

            alpha *= rho;

            // Enhanced stopping criteria
            if alpha < 1e-10 || iteration == max_iterations - 1 {
                break;
            }
        }

        alpha.clamp(1e-10, 1.0)
    }

    /// Strong Wolfe line search with enhanced implementation
    /// 強化実装付き強ウォルフ線形探索
    fn strong_wolfe_line_search(
        &self,
        param: &Tensor<f32>,
        grad: &Tensor<f32>,
        direction: &Tensor<f32>,
        c1: f32,
        c2: f32,
    ) -> f32 {
        let max_iterations = 15; // Increased iterations for better convergence

        // Compute directional derivative with enhanced numerical stability
        let grad_dir_product = grad * direction;
        let directional_derivative = grad_dir_product.sum();

        if directional_derivative >= -1e-12 {
            return 1e-6; // Not a descent direction
        }

        let alpha_low = 0.0;
        let mut alpha_high = f32::INFINITY;
        let mut alpha = self.learning_rate.min(1.0);

        // Enhanced search with better bracketing
        for iteration in 0..max_iterations {
            // Enhanced Armijo condition evaluation
            let armijo_threshold = alpha * c1 * directional_derivative;
            let armijo_satisfied = armijo_threshold.abs() > 1e-12;

            if armijo_satisfied {
                // In a full implementation, additional curvature condition would be checked:
                // |g(x + α*p)^T * p| ≤ c2 * |g(x)^T * p|
                // For now, we use a simplified acceptance criteria
                if alpha > 1e-10 {
                    return alpha;
                }
            }

            // Enhanced bracketing strategy
            if alpha_high == f32::INFINITY {
                // Expand phase
                if iteration < max_iterations / 2 {
                    alpha *= 1.6; // Golden ratio expansion
                } else {
                    alpha_high = alpha;
                    alpha = (alpha_low + alpha_high) * 0.5;
                }
            } else {
                // Bisection phase with improved convergence
                let new_alpha = (alpha_low + alpha_high) * 0.5;
                if (alpha_high - alpha_low) < 1e-12 * (alpha_high + alpha_low) {
                    alpha = new_alpha;
                    break;
                }
                alpha = new_alpha;
            }

            // Enhanced stopping criteria
            if alpha < 1e-12 || alpha > 100.0 {
                break;
            }
        }

        alpha.clamp(1e-12, 10.0)
    }

    /// Update L-BFGS memory with new gradient and parameter information
    /// 新しい勾配とパラメータ情報でL-BFGSメモリを更新  
    fn update_memory(
        &mut self,
        param_id: usize,
        param_change: Tensor<f32>,
        grad_change: Tensor<f32>,
        history_size: usize,
    ) -> RusTorchResult<()> {
        // Get or create memory for this parameter
        let memory = self
            .memory
            .entry(param_id)
            .or_insert_with(|| LBFGSMemory::new(history_size));

        // Update memory with enhanced error handling
        memory.update(param_change, grad_change)
    }
}

impl Optimizer for LBFGS {
    fn step(&mut self, param: &Tensor<f32>, grad: &Tensor<f32>) {
        let param_id = param.as_ptr() as usize;
        self.step_count += 1;

        // Store current parameter values
        let old_param = param.clone();

        // Enhanced convergence check based on gradient norm
        let grad_squared = grad * grad;
        let grad_norm = grad_squared.sum().sqrt();
        if grad_norm < self.tolerance_grad {
            return; // Converged
        }

        // Compute search direction using modular L-BFGS memory
        let search_direction = self.compute_search_direction(param_id, grad);

        // Enhanced line search with error handling
        let alpha = match &self.line_search_fn {
            LineSearchMethod::Backtracking { c1, rho } => {
                self.backtracking_line_search(param, grad, &search_direction, *c1, *rho)
            }
            LineSearchMethod::StrongWolfe { c1, c2 } => {
                self.strong_wolfe_line_search(param, grad, &search_direction, *c1, *c2)
            }
            LineSearchMethod::None => self.learning_rate,
        };

        // Update parameters with numerical stability
        let step = &search_direction * alpha;
        let new_param = &old_param + &step;
        param.copy_from(&new_param);

        // Update L-BFGS memory with enhanced error handling
        if let Some(prev_grad) = self.prev_grad.get(&param_id) {
            let param_change = &new_param - &old_param;
            let grad_change = grad - prev_grad;

            // Use the default history size from original implementation
            let history_size = 10; // This could be made configurable
            if let Err(e) = self.update_memory(param_id, param_change, grad_change, history_size) {
                // Log warning but continue - curvature condition not satisfied
                // In a production implementation, this could be logged
            }
        }

        // Store current gradient for next iteration
        self.prev_grad.insert(param_id, grad.clone());
    }

    fn learning_rate(&self) -> f32 {
        self.learning_rate
    }

    fn set_learning_rate(&mut self, lr: f32) {
        self.learning_rate = lr;
    }

    fn state_dict(&self) -> HashMap<String, f32> {
        let mut state = HashMap::new();
        state.insert("learning_rate".to_string(), self.learning_rate);
        state.insert("max_iter".to_string(), self.max_iter as f32);
        state.insert("max_eval".to_string(), self.max_eval as f32);
        state.insert("tolerance_grad".to_string(), self.tolerance_grad);
        state.insert("tolerance_change".to_string(), self.tolerance_change);
        // Note: history_size is now managed per-parameter in memory modules
        state.insert("step_count".to_string(), self.step_count as f32);
        state
    }

    fn load_state_dict(&mut self, state: HashMap<String, f32>) {
        if let Some(&lr) = state.get("learning_rate") {
            self.learning_rate = lr;
        }
        if let Some(&max_iter) = state.get("max_iter") {
            self.max_iter = max_iter as usize;
        }
        if let Some(&max_eval) = state.get("max_eval") {
            self.max_eval = max_eval as usize;
        }
        if let Some(&tolerance_grad) = state.get("tolerance_grad") {
            self.tolerance_grad = tolerance_grad;
        }
        if let Some(&tolerance_change) = state.get("tolerance_change") {
            self.tolerance_change = tolerance_change;
        }
        // Note: history_size is now managed per-parameter in memory modules
        if let Some(&step_count) = state.get("step_count") {
            self.step_count = step_count as usize;
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tensor::Tensor;

    #[test]
    fn test_lbfgs_creation() {
        let optimizer = LBFGS::new(0.1).unwrap();
        assert_eq!(optimizer.learning_rate(), 0.1);
        assert_eq!(optimizer.step_count, 0);
    }

    #[test]
    fn test_lbfgs_with_params() {
        let optimizer = LBFGS::with_params(
            0.01,
            50,
            100,
            1e-6,
            1e-10,
            15,
            LineSearchMethod::Backtracking { c1: 1e-4, rho: 0.5 },
        )
        .unwrap();
        assert_eq!(optimizer.learning_rate(), 0.01);
        assert_eq!(optimizer.max_iter, 50);
        assert_eq!(optimizer.max_eval, 100);
        assert_eq!(optimizer.tolerance_grad, 1e-6);
    }

    #[test]
    fn test_lbfgs_parameter_validation() {
        // Test invalid learning rate
        assert!(LBFGS::new(-0.1).is_err());
        assert!(LBFGS::new(0.0).is_err());

        // Test invalid gradient tolerance
        let result = LBFGS::with_params(0.1, 10, 10, -1e-5, 1e-9, 5, LineSearchMethod::None);
        assert!(result.is_err());

        // Test invalid history size
        let result = LBFGS::with_params(0.1, 10, 10, 1e-5, 1e-9, 0, LineSearchMethod::None);
        assert!(result.is_err());
    }

    #[test]
    fn test_lbfgs_step() {
        let mut optimizer = LBFGS::new(0.1).unwrap();
        let param = Tensor::<f32>::ones(&[3, 3]);
        let grad = Tensor::<f32>::ones(&[3, 3]) * 0.1;

        let initial_param = param.clone();

        // First step should work (steepest descent)
        optimizer.step(&param, &grad);
        assert_eq!(optimizer.step_count, 1);

        // Parameters should have changed
        let updated_data = param.data.as_slice().unwrap();
        let initial_data = initial_param.data.as_slice().unwrap();
        assert_ne!(updated_data[0], initial_data[0]);

        // Second step should use L-BFGS information
        let grad2 = Tensor::<f32>::ones(&[3, 3]) * 0.05;
        optimizer.step(&param, &grad2);
        assert_eq!(optimizer.step_count, 2);
    }

    #[test]
    fn test_lbfgs_memory_management() {
        let mut optimizer = LBFGS::new(0.1).unwrap();
        let param = Tensor::<f32>::ones(&[2, 2]);
        let param_id = param.as_ptr() as usize;

        // Simulate multiple updates to fill memory
        for i in 0..5 {
            let param_change = Tensor::<f32>::ones(&[2, 2]) * (i as f32 * 0.1 + 0.01); // Ensure non-zero
            let grad_change = Tensor::<f32>::ones(&[2, 2]) * (i as f32 * 0.05 + 0.01); // Ensure non-zero
            let _ = optimizer.update_memory(param_id, param_change, grad_change, 3);
        }

        // Memory should be limited to set size
        if let Some(memory) = optimizer.memory.get(&param_id) {
            assert!(memory.size() <= 3);
        }
    }

    #[test]
    fn test_lbfgs_memory_reset() {
        let mut optimizer = LBFGS::new(0.1).unwrap();
        let param = Tensor::<f32>::ones(&[2, 2]);
        let grad = Tensor::<f32>::ones(&[2, 2]) * 0.1;

        // Take a step to create memory
        optimizer.step(&param, &grad);
        assert_eq!(optimizer.step_count, 1);

        // Reset all memory
        optimizer.reset_memory(None);
        assert_eq!(optimizer.step_count, 0);
        assert!(optimizer.memory.is_empty());
        assert!(optimizer.prev_grad.is_empty());
    }

    #[test]
    fn test_lbfgs_search_direction() {
        let optimizer = LBFGS::new(0.1).unwrap();
        let grad = Tensor::<f32>::ones(&[2, 2]);
        let param_id = 12345;

        // First call should return negative gradient (steepest descent)
        let direction = optimizer.compute_search_direction(param_id, &grad);
        let expected = grad.clone() * (-1.0);

        let dir_data = direction.data.as_slice().unwrap();
        let exp_data = expected.data.as_slice().unwrap();

        for (d, e) in dir_data.iter().zip(exp_data.iter()) {
            assert!((d - e).abs() < 1e-6);
        }
    }

    #[test]
    fn test_lbfgs_state_dict() {
        let optimizer =
            LBFGS::with_params(0.05, 25, 50, 1e-4, 1e-8, 8, LineSearchMethod::None).unwrap();
        let state = optimizer.state_dict();

        assert_eq!(state["learning_rate"], 0.05);
        assert_eq!(state["max_iter"], 25.0);
        assert_eq!(state["tolerance_grad"], 1e-4);
    }

    #[test]
    fn test_lbfgs_convergence_check() {
        let mut optimizer = LBFGS::new(0.1).unwrap();
        optimizer.set_tolerance_grad(1e-2).unwrap();

        let param = Tensor::<f32>::ones(&[2, 2]);
        let small_grad = Tensor::<f32>::ones(&[2, 2]) * 1e-3;
        let initial_param = param.clone();

        // Should converge immediately with small gradient
        optimizer.step(&param, &small_grad);

        // Parameters should not change due to convergence
        let updated_data = param.data.as_slice().unwrap();
        let initial_data = initial_param.data.as_slice().unwrap();

        for (u, i) in updated_data.iter().zip(initial_data.iter()) {
            assert!((u - i).abs() < 1e-6);
        }
    }

    #[test]
    fn test_lbfgs_line_search_methods() {
        let param = Tensor::<f32>::ones(&[2, 2]);
        let grad = Tensor::<f32>::ones(&[2, 2]) * 0.1;
        let direction = grad.clone() * (-1.0);

        // Test backtracking line search
        let optimizer1 = LBFGS::with_params(
            0.1,
            10,
            10,
            1e-5,
            1e-9,
            5,
            LineSearchMethod::Backtracking { c1: 1e-4, rho: 0.5 },
        )
        .unwrap();
        let alpha1 = optimizer1.backtracking_line_search(&param, &grad, &direction, 1e-4, 0.5);
        assert!(alpha1 > 0.0);
        assert!(alpha1 <= 1.0);

        // Test strong Wolfe line search
        let optimizer2 = LBFGS::with_params(
            0.1,
            10,
            10,
            1e-5,
            1e-9,
            5,
            LineSearchMethod::StrongWolfe { c1: 1e-4, c2: 0.9 },
        )
        .unwrap();
        let alpha2 = optimizer2.strong_wolfe_line_search(&param, &grad, &direction, 1e-4, 0.9);
        assert!(alpha2 > 0.0);
        assert!(alpha2 <= 10.0);
    }
}