//! Stochastic gradient-based optimization methods.
//!
//! This module provides a unified front end ([`minimize_stochastic`]) over the
//! individual optimizers (SGD, SGD with momentum, RMSProp, Adam, AdamW), plus
//! shared utilities: learning-rate schedules, gradient clipping, mini-batch
//! index sampling, and data-provider abstractions.

pub mod adam;
pub mod adamw;
pub mod momentum;
pub mod rmsprop;
pub mod sgd;

pub use adam::{minimize_adam, AdamOptions};
pub use adamw::{minimize_adamw, AdamWOptions};
pub use momentum::{minimize_sgd_momentum, MomentumOptions};
pub use rmsprop::{minimize_rmsprop, RMSPropOptions};
pub use sgd::{minimize_sgd, SGDOptions};

use crate::error::OptimizeError;
use crate::unconstrained::result::OptimizeResult;
use scirs2_core::ndarray::{Array1, ArrayView1};
use scirs2_core::random::prelude::*;
/// Selects which stochastic optimizer [`minimize_stochastic`] dispatches to.
#[derive(Debug, Clone, Copy)]
pub enum StochasticMethod {
    /// Plain stochastic gradient descent.
    SGD,
    /// Stochastic gradient descent with momentum.
    Momentum,
    /// RMSProp adaptive learning rates.
    RMSProp,
    /// Adam (adaptive moment estimation).
    Adam,
    /// Adam with decoupled weight decay.
    AdamW,
}
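/// Shared configuration for all stochastic optimizers.
///
/// Method-specific hyperparameters (momentum, Adam betas, weight decay, ...)
/// are not exposed here; [`minimize_stochastic`] fills them with common defaults.
///
/// # Example
///
/// A minimal sketch of overriding a few fields on top of the defaults (the
/// chosen values are illustrative, not a recommendation):
///
/// ```ignore
/// let options = StochasticOptions {
///     learning_rate: 0.01,
///     batch_size: Some(64),
///     gradient_clip: Some(1.0),
///     ..Default::default()
/// };
/// ```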
#[derive(Debug, Clone)]
pub struct StochasticOptions {
    /// Initial learning rate.
    pub learning_rate: f64,
    /// Maximum number of iterations (epochs over the data).
    pub max_iter: usize,
    /// Mini-batch size; `None` uses the full dataset each step.
    pub batch_size: Option<usize>,
    /// Convergence tolerance.
    pub tol: f64,
    /// Whether to adapt the learning rate during optimization.
    pub adaptive_lr: bool,
    /// Decay factor used for adaptive learning-rate adjustment.
    pub lr_decay: f64,
    /// Schedule applied to the learning rate over epochs.
    pub lr_schedule: LearningRateSchedule,
    /// If set, clip gradients to this maximum Euclidean norm.
    pub gradient_clip: Option<f64>,
    /// If set, stop after this many epochs without improvement.
    pub early_stopping_patience: Option<usize>,
}
impl Default for StochasticOptions {
fn default() -> Self {
Self {
learning_rate: 0.001,
max_iter: 1000,
batch_size: None,
tol: 1e-6,
adaptive_lr: false,
lr_decay: 0.99,
lr_schedule: LearningRateSchedule::Constant,
gradient_clip: None,
early_stopping_patience: None,
}
}
}
/// Learning-rate schedule applied on top of the initial learning rate.
///
/// The exact update rules are implemented in [`update_learning_rate`].
#[derive(Debug, Clone)]
pub enum LearningRateSchedule {
    /// Keep the initial learning rate for the whole run.
    Constant,
    /// `lr = lr0 * decay_rate^epoch`.
    ExponentialDecay { decay_rate: f64 },
    /// Multiply by `decay_factor` every `decay_steps` epochs.
    StepDecay {
        decay_factor: f64,
        decay_steps: usize,
    },
    /// Decay linearly from the initial rate to zero over `max_epochs`.
    LinearDecay,
    /// Cosine annealing from the initial rate down to zero over `max_epochs`.
    CosineAnnealing,
    /// `lr = lr0 / (1 + decay_rate * epoch)`.
    InverseTimeDecay { decay_rate: f64 },
}
/// Source of training samples for the stochastic optimizers.
pub trait DataProvider {
    /// Total number of samples available.
    fn num_samples(&self) -> usize;
    /// Return the samples at the given indices as a flat vector.
    fn get_batch(&self, indices: &[usize]) -> Vec<f64>;
    /// Return every sample as a flat vector.
    fn get_full_data(&self) -> Vec<f64>;
}
/// A [`DataProvider`] backed by an in-memory `Vec<f64>`.
#[derive(Clone)]
pub struct InMemoryDataProvider {
    data: Vec<f64>,
}

impl InMemoryDataProvider {
    /// Create a provider that takes ownership of `data`.
    pub fn new(data: Vec<f64>) -> Self {
        Self { data }
    }
}
impl DataProvider for InMemoryDataProvider {
fn num_samples(&self) -> usize {
self.data.len()
}
fn get_batch(&self, indices: &[usize]) -> Vec<f64> {
indices.iter().map(|&i| self.data[i]).collect()
}
fn get_full_data(&self) -> Vec<f64> {
self.data.clone()
}
}
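/// Objective function whose value and gradient are evaluated on a mini-batch.
///
/// `batch_data` is whatever [`DataProvider::get_batch`] returned for the
/// sampled indices; how it is interpreted is up to the implementor. The
/// methods take `&mut self` so implementations may cache intermediate state.
///
/// # Example
///
/// A minimal sketch of a one-dimensional least-squares objective, where each
/// batch element is an observed target value (the types and names are
/// illustrative only):
///
/// ```ignore
/// struct LeastSquares;
///
/// impl StochasticGradientFunction for LeastSquares {
///     fn compute_value(&mut self, x: &ArrayView1<f64>, batch_data: &[f64]) -> f64 {
///         batch_data.iter().map(|&y| (x[0] - y).powi(2)).sum::<f64>() / batch_data.len() as f64
///     }
///     fn compute_gradient(&mut self, x: &ArrayView1<f64>, batch_data: &[f64]) -> Array1<f64> {
///         let g = 2.0 * batch_data.iter().map(|&y| x[0] - y).sum::<f64>() / batch_data.len() as f64;
///         Array1::from_vec(vec![g])
///     }
/// }
/// ```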
pub trait StochasticGradientFunction {
    /// Gradient of the objective at `x`, evaluated on `batch_data`.
    fn compute_gradient(&mut self, x: &ArrayView1<f64>, batch_data: &[f64]) -> Array1<f64>;
    /// Value of the objective at `x`, evaluated on `batch_data`.
    fn compute_value(&mut self, x: &ArrayView1<f64>, batch_data: &[f64]) -> f64;
}
/// Adapts a pair of ordinary (full-batch) closures to the
/// [`StochasticGradientFunction`] interface by ignoring the batch data.
pub struct BatchGradientWrapper<F, G> {
    func: F,
    grad: G,
}

impl<F, G> BatchGradientWrapper<F, G>
where
    F: FnMut(&ArrayView1<f64>) -> f64,
    G: FnMut(&ArrayView1<f64>) -> Array1<f64>,
{
    /// Wrap an objective closure `func` and its gradient closure `grad`.
    pub fn new(func: F, grad: G) -> Self {
        Self { func, grad }
    }
}
impl<F, G> StochasticGradientFunction for BatchGradientWrapper<F, G>
where
    F: FnMut(&ArrayView1<f64>) -> f64,
    G: FnMut(&ArrayView1<f64>) -> Array1<f64>,
{
    // The wrapped closures evaluate the full objective, so the batch data is ignored.
    fn compute_gradient(&mut self, x: &ArrayView1<f64>, _batch_data: &[f64]) -> Array1<f64> {
        (self.grad)(x)
    }
    fn compute_value(&mut self, x: &ArrayView1<f64>, _batch_data: &[f64]) -> f64 {
        (self.func)(x)
    }
}
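/// Compute the learning rate for `epoch` under the given schedule.
///
/// `max_epochs` is only used by the schedules that anneal over the full run
/// (`LinearDecay` and `CosineAnnealing`).
///
/// # Example
///
/// A small sketch of the step-decay schedule; the expected values follow
/// directly from the formula below:
///
/// ```ignore
/// let schedule = LearningRateSchedule::StepDecay { decay_factor: 0.5, decay_steps: 10 };
/// // Epochs 0..10 keep the initial rate, epochs 10..20 use half of it, and so on.
/// assert_eq!(update_learning_rate(0.1, 0, 100, &schedule), 0.1);
/// assert_eq!(update_learning_rate(0.1, 15, 100, &schedule), 0.05);
/// ```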
#[allow(dead_code)]
pub fn update_learning_rate(
initial_lr: f64,
epoch: usize,
max_epochs: usize,
schedule: &LearningRateSchedule,
) -> f64 {
match schedule {
LearningRateSchedule::Constant => initial_lr,
LearningRateSchedule::ExponentialDecay { decay_rate } => {
initial_lr * decay_rate.powi(epoch as i32)
}
LearningRateSchedule::StepDecay {
decay_factor,
decay_steps,
} => initial_lr * decay_factor.powi((epoch / decay_steps) as i32),
LearningRateSchedule::LinearDecay => {
initial_lr * (1.0 - epoch as f64 / max_epochs as f64).max(0.0)
}
LearningRateSchedule::CosineAnnealing => {
initial_lr
* 0.5
* (1.0 + (std::f64::consts::PI * epoch as f64 / max_epochs as f64).cos())
}
LearningRateSchedule::InverseTimeDecay { decay_rate } => {
initial_lr / (1.0 + decay_rate * epoch as f64)
}
}
}
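/// Rescale `gradient` in place so that its Euclidean norm does not exceed `max_norm`.
///
/// Gradients with a norm at or below `max_norm` are left untouched.
///
/// # Example
///
/// A minimal sketch mirroring the unit test below: a gradient of norm 5 is
/// scaled down to norm 2.5 while keeping its direction.
///
/// ```ignore
/// let mut grad = Array1::from_vec(vec![3.0, 4.0]); // norm 5
/// clip_gradients(&mut grad, 2.5);                  // rescaled to norm 2.5
/// ```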
#[allow(dead_code)]
pub fn clip_gradients(gradient: &mut Array1<f64>, max_norm: f64) {
    let grad_norm = gradient.mapv(|x| x * x).sum().sqrt();
    if grad_norm > max_norm {
        let scale = max_norm / grad_norm;
        gradient.mapv_inplace(|x| x * scale);
    }
}
/// Generate `batch_size` sample indices out of `num_samples`, optionally shuffled.
///
/// Without shuffling this simply returns the first `batch_size` indices.
#[allow(dead_code)]
pub fn generate_batch_indices(num_samples: usize, batch_size: usize, shuffle: bool) -> Vec<usize> {
    let mut indices: Vec<usize> = (0..num_samples).collect();
    if shuffle {
        use scirs2_core::random::seq::SliceRandom;
        indices.shuffle(&mut thread_rng());
    }
    indices.into_iter().take(batch_size).collect()
}
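/// Minimize an objective with the selected stochastic optimization method.
///
/// Method-specific hyperparameters that are not part of [`StochasticOptions`]
/// (momentum, Adam's `beta1`/`beta2`, RMSProp's decay rate, AdamW's weight
/// decay, ...) are filled in with common default values.
///
/// # Example
///
/// A minimal sketch minimizing a quadratic with Adam; the shape of the data
/// and the fields of the returned [`OptimizeResult`] are assumed here for
/// illustration:
///
/// ```ignore
/// use scirs2_core::ndarray::{array, ArrayView1};
///
/// let func = |x: &ArrayView1<f64>| x.iter().map(|v| v * v).sum::<f64>();
/// let grad = |x: &ArrayView1<f64>| x.mapv(|v| 2.0 * v);
/// let wrapper = BatchGradientWrapper::new(func, grad);
///
/// let data = Box::new(InMemoryDataProvider::new(vec![0.0; 100]));
/// let result = minimize_stochastic(
///     StochasticMethod::Adam,
///     wrapper,
///     array![1.0, -2.0],
///     data,
///     StochasticOptions::default(),
/// )?;
/// ```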
#[allow(dead_code)]
pub fn minimize_stochastic<F>(
method: StochasticMethod,
grad_func: F,
x0: Array1<f64>,
data_provider: Box<dyn DataProvider>,
options: StochasticOptions,
) -> Result<OptimizeResult<f64>, OptimizeError>
where
F: StochasticGradientFunction,
{
match method {
StochasticMethod::SGD => {
let sgd_options = SGDOptions {
learning_rate: options.learning_rate,
max_iter: options.max_iter,
tol: options.tol,
lr_schedule: options.lr_schedule,
gradient_clip: options.gradient_clip,
batch_size: options.batch_size,
};
sgd::minimize_sgd(grad_func, x0, data_provider, sgd_options)
}
StochasticMethod::Momentum => {
let momentum_options = MomentumOptions {
learning_rate: options.learning_rate,
                momentum: 0.9,
                max_iter: options.max_iter,
tol: options.tol,
lr_schedule: options.lr_schedule,
gradient_clip: options.gradient_clip,
batch_size: options.batch_size,
nesterov: false,
dampening: 0.0,
};
momentum::minimize_sgd_momentum(grad_func, x0, data_provider, momentum_options)
}
StochasticMethod::RMSProp => {
let rmsprop_options = RMSPropOptions {
learning_rate: options.learning_rate,
                decay_rate: 0.99,
                epsilon: 1e-8,
max_iter: options.max_iter,
tol: options.tol,
lr_schedule: options.lr_schedule,
gradient_clip: options.gradient_clip,
batch_size: options.batch_size,
centered: false,
momentum: None,
};
rmsprop::minimize_rmsprop(grad_func, x0, data_provider, rmsprop_options)
}
StochasticMethod::Adam => {
let adam_options = AdamOptions {
learning_rate: options.learning_rate,
beta1: 0.9,
beta2: 0.999,
epsilon: 1e-8,
max_iter: options.max_iter,
tol: options.tol,
lr_schedule: options.lr_schedule,
gradient_clip: options.gradient_clip,
batch_size: options.batch_size,
amsgrad: false,
};
adam::minimize_adam(grad_func, x0, data_provider, adam_options)
}
StochasticMethod::AdamW => {
let adamw_options = AdamWOptions {
learning_rate: options.learning_rate,
beta1: 0.9,
beta2: 0.999,
epsilon: 1e-8,
                weight_decay: 0.01,
                max_iter: options.max_iter,
tol: options.tol,
lr_schedule: options.lr_schedule,
gradient_clip: options.gradient_clip,
batch_size: options.batch_size,
decouple_weight_decay: true,
};
adamw::minimize_adamw(grad_func, x0, data_provider, adamw_options)
}
}
}
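/// Build a [`StochasticOptions`] preset tuned to a named problem class.
///
/// Recognized (case-insensitive) names are `"neural_network"`,
/// `"linear_regression"`, `"logistic_regression"`, `"large_scale"` and
/// `"noisy_gradients"`; any other string falls back to the defaults. Batch
/// sizes are capped relative to `dataset_size`.
///
/// # Example
///
/// A small sketch of requesting the neural-network preset; the asserted values
/// follow from the match arm below for a dataset of 10,000 samples:
///
/// ```ignore
/// let options = create_stochastic_options_for_problem("neural_network", 10_000);
/// assert_eq!(options.batch_size, Some(32));
/// assert_eq!(options.gradient_clip, Some(1.0));
/// ```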
#[allow(dead_code)]
pub fn create_stochastic_options_for_problem(
problem_type: &str,
dataset_size: usize,
) -> StochasticOptions {
match problem_type.to_lowercase().as_str() {
"neural_network" => StochasticOptions {
learning_rate: 0.001,
max_iter: 1000,
batch_size: Some(32.min(dataset_size / 10)),
lr_schedule: LearningRateSchedule::ExponentialDecay { decay_rate: 0.99 },
gradient_clip: Some(1.0),
early_stopping_patience: Some(50),
..Default::default()
},
"linear_regression" => StochasticOptions {
learning_rate: 0.01,
max_iter: 500,
batch_size: Some(64.min(dataset_size / 5)),
lr_schedule: LearningRateSchedule::LinearDecay,
..Default::default()
},
"logistic_regression" => StochasticOptions {
learning_rate: 0.01,
max_iter: 200,
batch_size: Some(32.min(dataset_size / 10)),
lr_schedule: LearningRateSchedule::StepDecay {
decay_factor: 0.9,
decay_steps: 50,
},
..Default::default()
},
"large_scale" => StochasticOptions {
learning_rate: 0.001,
max_iter: 2000,
batch_size: Some(128.min(dataset_size / 20)),
lr_schedule: LearningRateSchedule::CosineAnnealing,
gradient_clip: Some(5.0),
adaptive_lr: true,
..Default::default()
},
"noisy_gradients" => StochasticOptions {
learning_rate: 0.01,
max_iter: 1000,
batch_size: Some(64.min(dataset_size / 5)),
lr_schedule: LearningRateSchedule::InverseTimeDecay { decay_rate: 1.0 },
gradient_clip: Some(2.0),
early_stopping_patience: Some(100),
..Default::default()
},
_ => StochasticOptions::default(),
}
}
#[cfg(test)]
mod tests {
use super::*;
use approx::assert_abs_diff_eq;
#[test]
fn test_learning_rate_schedules() {
let initial_lr = 0.1;
let max_epochs = 100;
let constant = LearningRateSchedule::Constant;
assert_abs_diff_eq!(
update_learning_rate(initial_lr, 50, max_epochs, &constant),
initial_lr,
epsilon = 1e-10
);
let exp_decay = LearningRateSchedule::ExponentialDecay { decay_rate: 0.9 };
let lr_exp = update_learning_rate(initial_lr, 10, max_epochs, &exp_decay);
assert_abs_diff_eq!(lr_exp, initial_lr * 0.9_f64.powi(10), epsilon = 1e-10);
let linear = LearningRateSchedule::LinearDecay;
let lr_linear = update_learning_rate(initial_lr, 50, max_epochs, &linear);
assert_abs_diff_eq!(lr_linear, initial_lr * 0.5, epsilon = 1e-10);
}
#[test]
fn test_gradient_clipping() {
        let mut grad = Array1::from_vec(vec![3.0, 4.0]);
        clip_gradients(&mut grad, 2.5);
let clipped_norm = grad.mapv(|x| x * x).sum().sqrt();
assert_abs_diff_eq!(clipped_norm, 2.5, epsilon = 1e-10);
assert_abs_diff_eq!(grad[0] / grad[1], 3.0 / 4.0, epsilon = 1e-10);
}
#[test]
fn test_batch_indices_generation() {
let indices = generate_batch_indices(100, 10, false);
assert_eq!(indices.len(), 10);
assert_eq!(indices, (0..10).collect::<Vec<usize>>());
let shuffled = generate_batch_indices(100, 10, true);
assert_eq!(shuffled.len(), 10);
assert!(shuffled.iter().all(|&i| i < 100));
}
#[test]
fn test_in_memory_data_provider() {
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let provider = InMemoryDataProvider::new(data.clone());
assert_eq!(provider.num_samples(), 5);
assert_eq!(provider.get_full_data(), data);
let batch = provider.get_batch(&[0, 2, 4]);
assert_eq!(batch, vec![1.0, 3.0, 5.0]);
}
#[test]
fn test_problem_specific_options() {
let nn_options = create_stochastic_options_for_problem("neural_network", 1000);
assert_eq!(nn_options.learning_rate, 0.001);
assert!(nn_options.batch_size.is_some());
assert!(nn_options.gradient_clip.is_some());
let lr_options = create_stochastic_options_for_problem("linear_regression", 500);
assert_eq!(lr_options.learning_rate, 0.01);
assert!(matches!(
lr_options.lr_schedule,
LearningRateSchedule::LinearDecay
));
let large_options = create_stochastic_options_for_problem("large_scale", 10000);
assert!(matches!(
large_options.lr_schedule,
LearningRateSchedule::CosineAnnealing
));
assert_eq!(large_options.batch_size, Some(128));
}
}