use crate::{amacp::AMacPConfig, novograd::NovoGradConfig};
use scirs2_core::random::*; use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::{Duration, Instant};
use trustformers_core::errors::{Result, TrustformersError};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HyperparameterSpace {
pub learning_rate: (f32, f32),
pub beta1: (f32, f32),
pub beta2: (f32, f32),
pub weight_decay: (f32, f32),
pub epsilon: (f32, f32),
pub batch_sizes: Vec<usize>,
pub log_scale_lr: bool,
pub custom_params: HashMap<String, (f32, f32)>,
}
impl Default for HyperparameterSpace {
fn default() -> Self {
Self {
learning_rate: (1e-5, 1e-1),
beta1: (0.8, 0.999),
beta2: (0.9, 0.9999),
weight_decay: (0.0, 1e-1),
epsilon: (1e-10, 1e-6),
batch_sizes: vec![16, 32, 64, 128, 256],
log_scale_lr: true,
custom_params: HashMap::new(),
}
}
}
impl HyperparameterSpace {
pub fn for_transformers() -> Self {
Self {
learning_rate: (1e-5, 5e-3),
beta1: (0.85, 0.95),
beta2: (0.95, 0.999),
weight_decay: (1e-3, 1e-1),
epsilon: (1e-8, 1e-6),
batch_sizes: vec![32, 64, 128, 256],
log_scale_lr: true,
custom_params: [
("warmup_steps".to_string(), (1000.0, 10000.0)),
("max_grad_norm".to_string(), (0.5, 2.0)),
]
.into_iter()
.collect(),
}
}
pub fn for_vision() -> Self {
Self {
learning_rate: (1e-4, 1e-1),
beta1: (0.9, 0.99),
beta2: (0.999, 0.9999),
weight_decay: (1e-5, 1e-2),
epsilon: (1e-8, 1e-6),
batch_sizes: vec![16, 32, 64, 128],
log_scale_lr: true,
custom_params: HashMap::new(),
}
}
pub fn for_scientific_computing() -> Self {
Self {
learning_rate: (1e-6, 1e-2),
beta1: (0.95, 0.999),
beta2: (0.999, 0.9999),
weight_decay: (0.0, 1e-4),
epsilon: (1e-12, 1e-8),
batch_sizes: vec![32, 64, 128],
log_scale_lr: true,
custom_params: [("precision_threshold".to_string(), (1e-8, 1e-6))]
.into_iter()
.collect(),
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HyperparameterSample {
pub learning_rate: f32,
pub beta1: f32,
pub beta2: f32,
pub weight_decay: f32,
pub epsilon: f32,
pub batch_size: usize,
pub custom_params: HashMap<String, f32>,
pub performance_score: Option<f32>,
pub training_time: Option<f32>,
pub memory_usage: Option<usize>,
}
#[derive(Debug, Clone)]
pub struct OptimizationTask {
pub name: String,
pub model_size: usize,
pub dataset_size: usize,
pub max_epochs: usize,
pub convergence_threshold: f32,
pub target_metric: String,
pub task_type: TaskType,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum TaskType {
Classification,
Regression,
LanguageModeling,
ComputerVision,
ScientificComputing,
Reinforcement,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceMetrics {
pub final_loss: f32,
pub convergence_epoch: usize,
pub training_time: Duration,
pub memory_peak: usize,
pub stability_score: f32,
pub throughput: f32, pub gradient_norm_variance: f32,
pub composite_score: f32,
}
#[derive(Debug)]
pub struct BayesianOptimizer {
space: HyperparameterSpace,
samples: Vec<HyperparameterSample>,
good_samples: Vec<HyperparameterSample>,
poor_samples: Vec<HyperparameterSample>,
performance_threshold: f32,
#[allow(dead_code)]
exploration_factor: f32,
n_startup_trials: usize,
gamma: f32, }
impl BayesianOptimizer {
pub fn new(space: HyperparameterSpace) -> Self {
Self {
space,
samples: Vec::new(),
good_samples: Vec::new(),
poor_samples: Vec::new(),
performance_threshold: 0.0,
exploration_factor: 0.25,
n_startup_trials: 20,
gamma: 0.25,
}
}
pub fn suggest(&mut self) -> HyperparameterSample {
if self.samples.len() < self.n_startup_trials {
self.random_sample()
} else {
self.tpe_sample()
}
}
pub fn update(&mut self, mut sample: HyperparameterSample, performance: f32) {
sample.performance_score = Some(performance);
let mut performances: Vec<f32> =
self.samples.iter().filter_map(|s| s.performance_score).collect();
performances.push(performance);
performances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
if !performances.is_empty() {
self.performance_threshold = performances[performances.len() / 2];
}
if performance > self.performance_threshold {
self.good_samples.push(sample.clone());
} else {
self.poor_samples.push(sample.clone());
}
self.samples.push(sample);
if self.good_samples.len() > 1 {
self.good_samples.sort_by(|a, b| {
b.performance_score
.unwrap_or(0.0)
.partial_cmp(&a.performance_score.unwrap_or(0.0))
.unwrap_or(std::cmp::Ordering::Equal)
});
let keep_count = ((self.samples.len() as f32 * self.gamma).ceil() as usize).max(1);
self.good_samples.truncate(keep_count);
}
}
fn random_sample(&self) -> HyperparameterSample {
let mut rng = thread_rng();
let learning_rate = if self.space.log_scale_lr {
let log_min = self.space.learning_rate.0.ln();
let log_max = self.space.learning_rate.1.ln();
(rng.random::<f32>() * (log_max - log_min) + log_min).exp()
} else {
rng.random_range(self.space.learning_rate.0..=self.space.learning_rate.1)
};
HyperparameterSample {
learning_rate,
beta1: rng.random_range(self.space.beta1.0..=self.space.beta1.1),
beta2: rng.random_range(self.space.beta2.0..=self.space.beta2.1),
weight_decay: rng.random_range(self.space.weight_decay.0..=self.space.weight_decay.1),
epsilon: rng.random_range(self.space.epsilon.0..=self.space.epsilon.1),
batch_size: {
let idx = rng.random_range(0..self.space.batch_sizes.len());
self.space.batch_sizes[idx]
},
custom_params: self
.space
.custom_params
.iter()
.map(|(k, &(min, max))| (k.clone(), rng.random_range(min..=max)))
.collect(),
performance_score: None,
training_time: None,
memory_usage: None,
}
}
fn tpe_sample(&self) -> HyperparameterSample {
let mut rng = thread_rng();
if self.good_samples.is_empty() {
return self.random_sample();
}
let idx = rng.random_range(0..self.good_samples.len());
let good_sample = &self.good_samples[idx];
let noise_factor = 0.1;
let learning_rate = if self.space.log_scale_lr {
let log_lr = good_sample.learning_rate.ln();
let noise = rng.random_range(-noise_factor..=noise_factor);
(log_lr + noise)
.exp()
.clamp(self.space.learning_rate.0, self.space.learning_rate.1)
} else {
let noise = rng.random_range(-noise_factor..=noise_factor)
* (self.space.learning_rate.1 - self.space.learning_rate.0);
(good_sample.learning_rate + noise)
.clamp(self.space.learning_rate.0, self.space.learning_rate.1)
};
HyperparameterSample {
learning_rate,
beta1: (good_sample.beta1 + rng.random_range(-0.01..=0.01))
.clamp(self.space.beta1.0, self.space.beta1.1),
beta2: (good_sample.beta2 + rng.random_range(-0.001..=0.001))
.clamp(self.space.beta2.0, self.space.beta2.1),
weight_decay: (good_sample.weight_decay
+ rng.random_range(-noise_factor..=noise_factor)
* (self.space.weight_decay.1 - self.space.weight_decay.0))
.clamp(self.space.weight_decay.0, self.space.weight_decay.1),
epsilon: good_sample.epsilon,
batch_size: good_sample.batch_size,
custom_params: good_sample.custom_params.clone(),
performance_score: None,
training_time: None,
memory_usage: None,
}
}
pub fn get_best(&self) -> Option<&HyperparameterSample> {
self.samples.iter().filter(|s| s.performance_score.is_some()).max_by(|a, b| {
a.performance_score
.unwrap_or(0.0)
.partial_cmp(&b.performance_score.unwrap_or(0.0))
.unwrap_or(std::cmp::Ordering::Equal)
})
}
}
#[derive(Debug)]
pub struct MultiObjectiveOptimizer {
bayesian_opt: BayesianOptimizer,
#[allow(dead_code)]
objectives: Vec<String>,
weights: Vec<f32>,
pareto_front: Vec<HyperparameterSample>,
}
impl MultiObjectiveOptimizer {
pub fn new(space: HyperparameterSpace, objectives: Vec<String>, weights: Vec<f32>) -> Self {
assert_eq!(
objectives.len(),
weights.len(),
"Objectives and weights must have same length"
);
Self {
bayesian_opt: BayesianOptimizer::new(space),
objectives,
weights,
pareto_front: Vec::new(),
}
}
pub fn update_multi_objective(
&mut self,
sample: HyperparameterSample,
metrics: &PerformanceMetrics,
) {
let mut weighted_score = 0.0;
weighted_score += self.weights[0] * (1.0 / (1.0 + metrics.final_loss)); weighted_score += self.weights[1] * (1.0 / (1.0 + metrics.convergence_epoch as f32)); if self.weights.len() > 2 {
weighted_score += self.weights[2] * metrics.stability_score; }
if self.weights.len() > 3 {
weighted_score += self.weights[3] * (1.0 / (1.0 + metrics.training_time.as_secs_f32()));
}
self.bayesian_opt.update(sample, weighted_score);
self.update_pareto_front();
}
fn update_pareto_front(&mut self) {
self.pareto_front.clear();
for sample in &self.bayesian_opt.samples {
if let Some(sample_score) = sample.performance_score {
let mut is_dominated = false;
for other in &self.bayesian_opt.samples {
if let Some(other_score) = other.performance_score {
if other_score > sample_score {
is_dominated = true;
break;
}
}
}
if !is_dominated {
self.pareto_front.push(sample.clone());
}
}
}
}
}
#[derive(Debug)]
pub struct HyperparameterTuner {
optimizer_type: OptimizerType,
search_space: HyperparameterSpace,
bayesian_opt: BayesianOptimizer,
multi_objective_opt: Option<MultiObjectiveOptimizer>,
task: OptimizationTask,
max_trials: usize,
current_trial: usize,
best_config: Option<HyperparameterSample>,
optimization_history: Vec<(HyperparameterSample, PerformanceMetrics)>,
}
#[derive(Debug, Clone)]
pub enum OptimizerType {
Adam,
AdamW,
AMacP,
NovoGrad,
AveragedAdam,
Lion,
LAMB,
}
impl HyperparameterTuner {
pub fn new(
optimizer_type: OptimizerType,
search_space: HyperparameterSpace,
task: OptimizationTask,
max_trials: usize,
) -> Self {
let bayesian_opt = BayesianOptimizer::new(search_space.clone());
Self {
optimizer_type,
search_space,
bayesian_opt,
multi_objective_opt: None,
task,
max_trials,
current_trial: 0,
best_config: None,
optimization_history: Vec::new(),
}
}
pub fn enable_multi_objective(&mut self, objectives: Vec<String>, weights: Vec<f32>) {
self.multi_objective_opt = Some(MultiObjectiveOptimizer::new(
self.search_space.clone(),
objectives,
weights,
));
}
pub fn suggest_next(&mut self) -> Option<HyperparameterSample> {
if self.current_trial >= self.max_trials {
return None;
}
self.current_trial += 1;
Some(self.bayesian_opt.suggest())
}
pub fn evaluate_config(&mut self, config: HyperparameterSample) -> Result<PerformanceMetrics> {
let _start_time = Instant::now();
let metrics = self.simulate_training(&config)?;
if let Some(ref mut multi_opt) = self.multi_objective_opt {
multi_opt.update_multi_objective(config.clone(), &metrics);
} else {
self.bayesian_opt.update(config.clone(), metrics.composite_score);
}
let current_best_score =
self.best_config.as_ref().and_then(|c| c.performance_score).unwrap_or(0.0);
if self.best_config.is_none() || metrics.composite_score > current_best_score {
let mut best_config = config.clone();
best_config.performance_score = Some(metrics.composite_score);
self.best_config = Some(best_config);
}
self.optimization_history.push((config, metrics.clone()));
Ok(metrics)
}
fn simulate_training(&self, config: &HyperparameterSample) -> Result<PerformanceMetrics> {
let mut rng = thread_rng();
let lr_factor = if config.learning_rate > 1e-2 {
0.7_f64 } else if config.learning_rate < 1e-5 {
0.8_f64 } else {
1.0_f64 };
let momentum_factor = if config.beta1 > 0.95 { 0.9_f64 } else { 1.0_f64 };
let variance_factor = if config.beta2 < 0.99 { 0.85_f64 } else { 1.0_f64 };
let regularization_factor = if config.weight_decay > 1e-2 { 0.8_f64 } else { 1.0_f64 };
let base_performance = 0.8_f64;
let noise = rng.random_range(-0.1_f64..=0.1_f64);
let final_loss = (1.0_f64
- base_performance
* lr_factor
* momentum_factor
* variance_factor
* regularization_factor
+ noise)
.max(0.01_f64);
let convergence_epoch = (50.0 / lr_factor) as usize;
let training_time = Duration::from_secs((convergence_epoch as f32 * 0.1) as u64);
let memory_peak = (config.batch_size * 1024 * 1024) + rng.random_range(0..1024 * 1024);
let stability_score = momentum_factor * variance_factor;
let throughput =
(config.batch_size as f32) / (training_time.as_secs_f32() / convergence_epoch as f32);
let gradient_norm_variance = rng.random_range(0.01..=0.5);
let composite_score = (1.0_f64 / final_loss) * 0.4_f64
+ (1.0_f64 / convergence_epoch as f64) * 0.3_f64
+ stability_score * 0.2_f64
+ (throughput as f64 / 1000.0_f64).min(1.0_f64) * 0.1_f64;
Ok(PerformanceMetrics {
final_loss: final_loss as f32,
convergence_epoch,
training_time,
memory_peak,
stability_score: stability_score as f32,
throughput,
gradient_norm_variance,
composite_score: composite_score as f32,
})
}
pub fn optimize(&mut self) -> Result<HyperparameterSample> {
println!(
"🚀 Starting hyperparameter optimization for {:?}",
self.optimizer_type
);
println!(
"📊 Task: {} (max {} trials)",
self.task.name, self.max_trials
);
let mut trial_results = Vec::new();
while let Some(config) = self.suggest_next() {
println!("\n🔍 Trial {}/{}", self.current_trial, self.max_trials);
println!(
" LR: {:.2e}, β₁: {:.3}, β₂: {:.4}, WD: {:.2e}",
config.learning_rate, config.beta1, config.beta2, config.weight_decay
);
let metrics = self.evaluate_config(config.clone())?;
trial_results.push((config, metrics.clone()));
println!(
" 📈 Score: {:.4}, Loss: {:.4}, Epochs: {}, Time: {:.1}s",
metrics.composite_score,
metrics.final_loss,
metrics.convergence_epoch,
metrics.training_time.as_secs_f32()
);
if metrics.composite_score > 0.95 {
println!("🎯 Early stopping - excellent configuration found!");
break;
}
}
self.print_optimization_summary();
self.best_config.clone().ok_or_else(|| {
TrustformersError::new(trustformers_core::errors::ErrorKind::InvalidConfiguration {
field: "hyperparameter_optimization".to_string(),
reason: "No valid configuration found".to_string(),
})
})
}
fn print_optimization_summary(&self) {
println!("\n📊 Hyperparameter Optimization Summary");
println!("=====================================");
if let Some(ref best) = self.best_config {
println!("🏆 Best Configuration Found:");
println!(" Learning Rate: {:.2e}", best.learning_rate);
println!(" Beta1: {:.4}", best.beta1);
println!(" Beta2: {:.4}", best.beta2);
println!(" Weight Decay: {:.2e}", best.weight_decay);
println!(" Batch Size: {}", best.batch_size);
println!(
" Performance Score: {:.4}",
best.performance_score.unwrap_or(0.0)
);
}
println!("\n📈 Optimization Statistics:");
println!(" Total Trials: {}", self.optimization_history.len());
if !self.optimization_history.is_empty() {
let scores: Vec<f32> =
self.optimization_history.iter().map(|(_, m)| m.composite_score).collect();
let avg_score = scores.iter().sum::<f32>() / scores.len() as f32;
let max_score = scores.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
let min_score = scores.iter().fold(f32::INFINITY, |a, &b| a.min(b));
println!(" Average Score: {:.4}", avg_score);
println!(" Score Range: {:.4} - {:.4}", min_score, max_score);
println!(
" Improvement: {:.1}%",
((max_score - min_score) / min_score * 100.0).max(0.0)
);
}
}
pub fn get_history(&self) -> &[(HyperparameterSample, PerformanceMetrics)] {
&self.optimization_history
}
pub fn get_pareto_front(&self) -> Option<&[HyperparameterSample]> {
self.multi_objective_opt.as_ref().map(|opt| opt.pareto_front.as_slice())
}
}
impl HyperparameterTuner {
pub fn optimize_amacp_for_transformers(max_trials: usize) -> Result<AMacPConfig> {
let space = HyperparameterSpace::for_transformers();
let task = OptimizationTask {
name: "Transformer Language Modeling".to_string(),
model_size: 125_000_000, dataset_size: 1_000_000,
max_epochs: 100,
convergence_threshold: 0.01,
target_metric: "perplexity".to_string(),
task_type: TaskType::LanguageModeling,
};
let mut tuner = HyperparameterTuner::new(OptimizerType::AMacP, space, task, max_trials);
let best_config = tuner.optimize()?;
Ok(AMacPConfig {
learning_rate: best_config.learning_rate,
beta1: best_config.beta1,
beta2: best_config.beta2,
weight_decay: best_config.weight_decay,
epsilon: best_config.epsilon,
..AMacPConfig::for_transformers()
})
}
pub fn optimize_novograd_for_llms(max_trials: usize) -> Result<NovoGradConfig> {
let space = HyperparameterSpace::for_transformers();
let task = OptimizationTask {
name: "Large Language Model Training".to_string(),
model_size: 1_000_000_000, dataset_size: 10_000_000,
max_epochs: 50,
convergence_threshold: 0.005,
target_metric: "loss".to_string(),
task_type: TaskType::LanguageModeling,
};
let mut tuner = HyperparameterTuner::new(OptimizerType::NovoGrad, space, task, max_trials);
let best_config = tuner.optimize()?;
Ok(NovoGradConfig {
learning_rate: best_config.learning_rate,
beta1: best_config.beta1,
beta2: best_config.beta2,
weight_decay: best_config.weight_decay,
epsilon: best_config.epsilon,
..NovoGradConfig::for_large_language_models()
})
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_hyperparameter_space_creation() {
let space = HyperparameterSpace::default();
assert_eq!(space.learning_rate, (1e-5, 1e-1));
assert!(space.log_scale_lr);
let transformer_space = HyperparameterSpace::for_transformers();
assert!(transformer_space.custom_params.contains_key("warmup_steps"));
}
#[test]
fn test_bayesian_optimizer_suggestion() {
let space = HyperparameterSpace::default();
let mut optimizer = BayesianOptimizer::new(space);
let sample = optimizer.suggest();
assert!(sample.learning_rate >= 1e-5 && sample.learning_rate <= 1e-1);
assert!(sample.beta1 >= 0.8 && sample.beta1 <= 0.999);
}
#[test]
fn test_bayesian_optimizer_update() {
let space = HyperparameterSpace::default();
let mut optimizer = BayesianOptimizer::new(space);
let sample = optimizer.suggest();
optimizer.update(sample, 0.85);
assert_eq!(optimizer.samples.len(), 1);
assert!(optimizer.get_best().is_some());
}
#[test]
fn test_hyperparameter_tuner_creation() {
let space = HyperparameterSpace::for_vision();
let task = OptimizationTask {
name: "Test Task".to_string(),
model_size: 1000,
dataset_size: 10000,
max_epochs: 10,
convergence_threshold: 0.01,
target_metric: "accuracy".to_string(),
task_type: TaskType::Classification,
};
let tuner = HyperparameterTuner::new(OptimizerType::Adam, space, task, 50);
assert_eq!(tuner.max_trials, 50);
assert_eq!(tuner.current_trial, 0);
}
#[test]
fn test_multi_objective_optimizer() {
let space = HyperparameterSpace::default();
let objectives = vec!["accuracy".to_string(), "speed".to_string()];
let weights = vec![0.7, 0.3];
let mut optimizer = MultiObjectiveOptimizer::new(space, objectives, weights);
let sample = HyperparameterSample {
learning_rate: 1e-3,
beta1: 0.9,
beta2: 0.999,
weight_decay: 1e-4,
epsilon: 1e-8,
batch_size: 64,
custom_params: HashMap::new(),
performance_score: None,
training_time: None,
memory_usage: None,
};
let metrics = PerformanceMetrics {
final_loss: 0.1,
convergence_epoch: 25,
training_time: Duration::from_secs(120),
memory_peak: 1024 * 1024,
stability_score: 0.9,
throughput: 1000.0,
gradient_norm_variance: 0.1,
composite_score: 0.85,
};
optimizer.update_multi_objective(sample, &metrics);
assert!(!optimizer.pareto_front.is_empty());
}
#[test]
fn test_performance_metrics_calculation() {
let space = HyperparameterSpace::default();
let task = OptimizationTask {
name: "Test".to_string(),
model_size: 1000,
dataset_size: 1000,
max_epochs: 10,
convergence_threshold: 0.01,
target_metric: "loss".to_string(),
task_type: TaskType::Regression,
};
let tuner = HyperparameterTuner::new(OptimizerType::Adam, space, task, 10);
let config = HyperparameterSample {
learning_rate: 1e-3,
beta1: 0.9,
beta2: 0.999,
weight_decay: 0.0,
epsilon: 1e-8,
batch_size: 32,
custom_params: HashMap::new(),
performance_score: None,
training_time: None,
memory_usage: None,
};
let metrics = tuner.simulate_training(&config);
assert!(metrics.is_ok());
let metrics = metrics.expect("Operation failed in test");
assert!(metrics.final_loss >= 0.0);
assert!(metrics.convergence_epoch > 0);
assert!(metrics.composite_score > 0.0);
}
#[test]
fn test_convenience_optimization_functions() {
let result = HyperparameterTuner::optimize_amacp_for_transformers(5);
assert!(result.is_ok());
let result = HyperparameterTuner::optimize_novograd_for_llms(5);
assert!(result.is_ok());
}
}