scirs2_optimize/learned_optimizers/
mod.rs

//! Learned Optimizers Module
//!
//! This module implements optimization algorithms that learn to optimize, including:
//! - Meta-learning based optimizers that adapt to different problem types
//! - Neural Architecture Search (NAS) systems
//! - Transformer-based optimization enhancements
//! - Few-shot learning for optimization
//! - Adaptive neural optimizers
//! - Learned hyperparameter tuning systems
10
11use crate::error::OptimizeError;
12use crate::error::OptimizeResult;
13use crate::result::OptimizeResults;
14use ndarray::{Array1, Array2, ArrayView1};
15use rand::{rng, Rng};
16use statrs::statistics::Statistics;
17use std::collections::HashMap;
18
19type Result<T> = std::result::Result<T, OptimizeError>;
20
21pub mod adaptive_nas_system;
22pub mod adaptive_transformer_enhancement;
23pub mod few_shot_learning_enhancement;
24pub mod learned_hyperparameter_tuner;
25pub mod meta_learning_optimizer;
26pub mod neural_adaptive_optimizer;
27
28// Use glob re-exports with allow for ambiguous names
29#[allow(ambiguous_glob_reexports)]
30pub use adaptive_nas_system::*;
31#[allow(ambiguous_glob_reexports)]
32pub use adaptive_transformer_enhancement::*;
33#[allow(ambiguous_glob_reexports)]
34pub use few_shot_learning_enhancement::*;
35#[allow(ambiguous_glob_reexports)]
36pub use learned_hyperparameter_tuner::*;
37#[allow(ambiguous_glob_reexports)]
38pub use meta_learning_optimizer::*;
39#[allow(ambiguous_glob_reexports)]
40pub use neural_adaptive_optimizer::*;
41
/// Configuration for learned optimizers.
///
/// Start from [`Default::default`] and override individual fields with
/// struct-update syntax (`LearnedOptimizationConfig { inner_steps: 5, ..Default::default() }`).
#[derive(Debug, Clone, PartialEq)]
pub struct LearnedOptimizationConfig {
    /// Number of meta-training episodes
    pub meta_training_episodes: usize,
    /// Learning rate for meta-optimizer
    pub meta_learning_rate: f64,
    /// Number of inner optimization steps
    pub inner_steps: usize,
    /// Inner learning rate
    pub inner_learning_rate: f64,
    /// Batch size for meta-learning
    pub batch_size: usize,
    /// Maximum number of parameters to optimize
    pub max_parameters: usize,
    /// Whether to use transformer architecture
    pub use_transformer: bool,
    /// Hidden size for neural networks
    pub hidden_size: usize,
    /// Number of attention heads (for transformer)
    pub num_heads: usize,
    /// Whether to enable few-shot adaptation
    pub few_shot_adaptation: bool,
    /// Temperature for exploration
    pub exploration_temperature: f64,
}

impl Default for LearnedOptimizationConfig {
    /// Stock settings: 10000 meta-training episodes, meta learning rate 0.001,
    /// 10 inner steps at learning rate 0.01, batch size 32, at most 1000
    /// parameters, transformer enabled with hidden size 256 and 8 heads,
    /// few-shot adaptation enabled, exploration temperature 1.0.
    fn default() -> Self {
        Self {
            meta_training_episodes: 10000,
            meta_learning_rate: 0.001,
            inner_steps: 10,
            inner_learning_rate: 0.01,
            batch_size: 32,
            max_parameters: 1000,
            use_transformer: true,
            hidden_size: 256,
            num_heads: 8,
            few_shot_adaptation: true,
            exploration_temperature: 1.0,
        }
    }
}
86
/// Meta-learning problem specification
#[derive(Debug, Clone)]
pub struct OptimizationProblem {
    /// Problem identifier
    pub name: String,
    /// Problem dimensionality
    pub dimension: usize,
    /// Problem class (e.g., "quadratic", "neural_network", "sparse")
    pub problem_class: String,
    /// Additional metadata, keyed by name (the problem encoder in this file
    /// reads the optional `"complexity"` and `"sparsity"` entries)
    pub metadata: HashMap<String, f64>,
    /// Function evaluations budget
    pub max_evaluations: usize,
    /// Target accuracy
    pub target_accuracy: f64,
}
103
104/// Training task for meta-learning
105#[derive(Debug, Clone)]
106pub struct TrainingTask {
107    /// Problem specification
108    pub problem: OptimizationProblem,
109    /// Initial parameter distribution
110    pub initial_distribution: ParameterDistribution,
111    /// Ground truth optimum (if known)
112    pub true_optimum: Option<Array1<f64>>,
113    /// Task difficulty weight
114    pub difficulty_weight: f64,
115}
116
117/// Parameter distribution for initialization
118#[derive(Debug, Clone)]
119pub enum ParameterDistribution {
120    /// Uniform distribution in range
121    Uniform { low: f64, high: f64 },
122    /// Normal distribution
123    Normal { mean: f64, std: f64 },
124    /// Custom distribution from samples
125    Custom { samples: Vec<Array1<f64>> },
126}
127
128/// Meta-optimizer state
129#[derive(Debug, Clone)]
130pub struct MetaOptimizerState {
131    /// Current meta-parameters
132    pub meta_params: Array1<f64>,
133    /// Optimizer network weights
134    pub network_weights: Array2<f64>,
135    /// Performance history
136    pub performance_history: Vec<f64>,
137    /// Adaptation statistics
138    pub adaptation_stats: AdaptationStatistics,
139    /// Current episode
140    pub episode: usize,
141}
142
/// Statistics for tracking adaptation
#[derive(Debug, Clone, PartialEq)]
pub struct AdaptationStatistics {
    /// Average convergence rate
    pub avg_convergence_rate: f64,
    /// Success rate across problems
    pub success_rate: f64,
    /// Average function evaluations used
    pub avg_evaluations: f64,
    /// Transfer learning efficiency
    pub transfer_efficiency: f64,
    /// Exploration-exploitation balance
    pub exploration_ratio: f64,
}

impl Default for AdaptationStatistics {
    /// All statistics start at zero except `exploration_ratio`, which starts at 0.5.
    fn default() -> Self {
        Self {
            avg_convergence_rate: 0.0,
            success_rate: 0.0,
            avg_evaluations: 0.0,
            transfer_efficiency: 0.0,
            exploration_ratio: 0.5,
        }
    }
}
169
170/// Trait for learned optimizers
171pub trait LearnedOptimizer {
172    /// Meta-train the optimizer on a distribution of problems
173    fn meta_train(&mut self, training_tasks: &[TrainingTask]) -> Result<()>;
174
175    /// Adapt to a new problem with few-shot learning
176    fn adapt_to_problem(
177        &mut self,
178        problem: &OptimizationProblem,
179        initial_params: &ArrayView1<f64>,
180    ) -> Result<()>;
181
182    /// Optimize a function using learned knowledge
183    fn optimize<F>(
184        &mut self,
185        objective: F,
186        initial_params: &ArrayView1<f64>,
187    ) -> OptimizeResult<OptimizeResults<f64>>
188    where
189        F: Fn(&ArrayView1<f64>) -> f64;
190
191    /// Get current meta-optimizer state
192    fn get_state(&self) -> &MetaOptimizerState;
193
194    /// Reset the optimizer
195    fn reset(&mut self);
196}
197
198/// Neural network for learned optimization
199#[derive(Debug, Clone)]
200pub struct OptimizationNetwork {
201    /// Input embedding layer
202    pub input_embedding: Array2<f64>,
203    /// Hidden layers
204    pub hidden_layers: Vec<Array2<f64>>,
205    /// Output layer
206    pub output_layer: Array2<f64>,
207    /// Attention weights (if using transformer)
208    pub attention_weights: Option<Vec<Array2<f64>>>,
209    /// Layer normalization parameters
210    pub layer_norms: Vec<LayerNorm>,
211    /// Activation function type
212    pub activation: ActivationType,
213}
214
215/// Layer normalization parameters
216#[derive(Debug, Clone)]
217pub struct LayerNorm {
218    /// Scale parameter
219    pub gamma: Array1<f64>,
220    /// Shift parameter
221    pub beta: Array1<f64>,
222    /// Small constant for numerical stability
223    pub epsilon: f64,
224}
225
/// Types of activation functions
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ActivationType {
    /// Rectified linear unit: `max(x, 0)`.
    ReLU,
    /// Gaussian error linear unit, computed with the tanh approximation.
    GELU,
    /// Swish: `x * sigmoid(x)`.
    Swish,
    /// Hyperbolic tangent.
    Tanh,
    /// Leaky ReLU: identity for positive inputs, slope 0.01 otherwise.
    LeakyReLU,
}

impl ActivationType {
    /// Truncated value of `sqrt(2 / pi)` used by the tanh approximation of GELU.
    const GELU_SCALE: f64 = 0.7978845608;
    /// Cubic coefficient inside the GELU tanh argument.
    const GELU_CUBIC: f64 = 0.044715;
    /// Three times `GELU_CUBIC`; appears when differentiating the tanh argument.
    const GELU_CUBIC_TIMES_3: f64 = 0.134145;
    /// Slope of LeakyReLU for non-positive inputs.
    const LEAKY_SLOPE: f64 = 0.01;

    /// Evaluate the activation function at `x`.
    pub fn apply(&self, x: f64) -> f64 {
        match self {
            ActivationType::ReLU => x.max(0.0),
            ActivationType::GELU => {
                let tanh_arg = x * Self::GELU_SCALE * (1.0 + Self::GELU_CUBIC * x * x);
                x * 0.5 * (1.0 + tanh_arg.tanh())
            }
            // x * sigmoid(x), written as a single division.
            ActivationType::Swish => x / (1.0 + (-x).exp()),
            ActivationType::Tanh => x.tanh(),
            ActivationType::LeakyReLU => {
                if x > 0.0 {
                    x
                } else {
                    Self::LEAKY_SLOPE * x
                }
            }
        }
    }

    /// Evaluate the derivative of the activation function at `x`.
    ///
    /// For the piecewise-linear activations the value at `x == 0.0` is the
    /// slope of the non-positive branch (0 for ReLU, 0.01 for LeakyReLU).
    pub fn derivative(&self, x: f64) -> f64 {
        match self {
            ActivationType::ReLU => {
                if x > 0.0 {
                    1.0
                } else {
                    0.0
                }
            }
            ActivationType::GELU => {
                let tanh_arg = x * Self::GELU_SCALE * (1.0 + Self::GELU_CUBIC * x * x);
                let tanh_val = tanh_arg.tanh();
                // Product rule on 0.5 * x * (1 + tanh(u(x))).
                0.5 * (1.0 + tanh_val)
                    + x * 0.5
                        * (1.0 - tanh_val * tanh_val)
                        * Self::GELU_SCALE
                        * (1.0 + Self::GELU_CUBIC_TIMES_3 * x * x)
            }
            ActivationType::Swish => {
                let sigmoid = 1.0 / (1.0 + (-x).exp());
                sigmoid * (1.0 + x * (1.0 - sigmoid))
            }
            ActivationType::Tanh => {
                let t = x.tanh();
                1.0 - t * t
            }
            ActivationType::LeakyReLU => {
                if x > 0.0 {
                    1.0
                } else {
                    Self::LEAKY_SLOPE
                }
            }
        }
    }
}
291
292impl OptimizationNetwork {
293    /// Create new optimization network
294    pub fn new(
295        input_size: usize,
296        hidden_sizes: Vec<usize>,
297        output_size: usize,
298        use_attention: bool,
299        activation: ActivationType,
300    ) -> Self {
301        let mut hidden_layers = Vec::new();
302        let mut layer_norms = Vec::new();
303
304        // Create layers
305        let mut prev_size = input_size;
306        for &hidden_size in &hidden_sizes {
307            let weights = Array2::from_shape_fn((hidden_size, prev_size), |_| {
308                rand::rng().random_range(-0.5..0.5) * (2.0 / prev_size as f64).sqrt()
309            });
310            hidden_layers.push(weights);
311
312            // Layer normalization
313            layer_norms.push(LayerNorm {
314                gamma: Array1::ones(hidden_size),
315                beta: Array1::zeros(hidden_size),
316                epsilon: 1e-6,
317            });
318
319            prev_size = hidden_size;
320        }
321
322        // Input embedding
323        let input_embedding = Array2::from_shape_fn((hidden_sizes[0], input_size), |_| {
324            rand::rng().random_range(-0.5..0.5) * (2.0 / input_size as f64).sqrt()
325        });
326
327        // Output layer
328        let output_layer = Array2::from_shape_fn((output_size, prev_size), |_| {
329            rand::rng().random_range(-0.5..0.5) * (2.0 / prev_size as f64).sqrt()
330        });
331
332        // Attention weights (simplified)
333        let attention_weights = if use_attention {
334            Some(vec![Array2::from_shape_fn((prev_size, prev_size), |_| {
335                rand::rng().random_range(-0.5..0.5) * (2.0 / prev_size as f64).sqrt()
336            })])
337        } else {
338            None
339        };
340
341        Self {
342            input_embedding,
343            hidden_layers,
344            output_layer,
345            attention_weights,
346            layer_norms,
347            activation,
348        }
349    }
350
351    /// Forward pass through the network
352    pub fn forward(&self, input: &ArrayView1<f64>) -> Array1<f64> {
353        // Input embedding
354        let mut current = Array1::zeros(self.input_embedding.nrows());
355        for i in 0..current.len() {
356            for j in 0..input.len().min(self.input_embedding.ncols()) {
357                current[i] += self.input_embedding[[i, j]] * input[j];
358            }
359        }
360
361        // Apply activation
362        current.mapv_inplace(|x| self.activation.apply(x));
363
364        // Hidden layers with layer normalization
365        for (layer_idx, layer) in self.hidden_layers.iter().enumerate() {
366            let mut next = Array1::zeros(layer.nrows());
367
368            // Linear transformation
369            for i in 0..next.len() {
370                for j in 0..current.len().min(layer.ncols()) {
371                    next[i] += layer[[i, j]] * current[j];
372                }
373            }
374
375            // Layer normalization
376            if layer_idx < self.layer_norms.len() {
377                let layer_norm = &self.layer_norms[layer_idx];
378                let mean = next.view().mean();
379                let var = next.view().variance();
380                let std = (var + layer_norm.epsilon).sqrt();
381
382                for i in 0..next.len() {
383                    if i < layer_norm.gamma.len() && i < layer_norm.beta.len() {
384                        next[i] = layer_norm.gamma[i] * (next[i] - mean) / std + layer_norm.beta[i];
385                    }
386                }
387            }
388
389            // Apply attention (simplified)
390            if let Some(ref attention) = self.attention_weights {
391                if !attention.is_empty() {
392                    let attn_weights = &attention[0];
393                    let mut attended: Array1<f64> = Array1::zeros(next.len());
394
395                    for i in 0..attended.len() {
396                        for j in 0..next.len().min(attn_weights.ncols()) {
397                            attended[i] += attn_weights[[i, j]] * next[j];
398                        }
399                    }
400
401                    // Residual connection
402                    next = &next + &attended;
403                }
404            }
405
406            // Activation
407            next.mapv_inplace(|x| self.activation.apply(x));
408            current = next;
409        }
410
411        // Output layer
412        let mut output = Array1::zeros(self.output_layer.nrows());
413        for i in 0..output.len() {
414            for j in 0..current.len().min(self.output_layer.ncols()) {
415                output[i] += self.output_layer[[i, j]] * current[j];
416            }
417        }
418
419        output
420    }
421}
422
423/// Problem encoder for creating embeddings
424#[derive(Debug, Clone)]
425pub struct ProblemEncoder {
426    /// Dimensionality features
427    pub dim_encoder: Array2<f64>,
428    /// Gradient features encoder
429    pub gradient_encoder: Array2<f64>,
430    /// Hessian features encoder
431    pub hessian_encoder: Array2<f64>,
432    /// Output embedding size
433    pub embedding_size: usize,
434}
435
436impl ProblemEncoder {
437    /// Create new problem encoder
438    pub fn new(embedding_size: usize) -> Self {
439        let dim = 10; // Feature dimensions for different aspects
440
441        Self {
442            dim_encoder: Array2::from_shape_fn((embedding_size, dim), |_| {
443                rand::rng().random_range(-0.5..0.5) * 0.1
444            }),
445            gradient_encoder: Array2::from_shape_fn((embedding_size, dim), |_| {
446                rand::rng().random_range(-0.5..0.5) * 0.1
447            }),
448            hessian_encoder: Array2::from_shape_fn((embedding_size, dim), |_| {
449                rand::rng().random_range(-0.5..0.5) * 0.1
450            }),
451            embedding_size,
452        }
453    }
454
455    /// Encode a problem into an embedding
456    pub fn encode_problem<F>(
457        &self,
458        objective: &F,
459        current_params: &ArrayView1<f64>,
460        problem: &OptimizationProblem,
461    ) -> Array1<f64>
462    where
463        F: Fn(&ArrayView1<f64>) -> f64,
464    {
465        let mut embedding = Array1::zeros(self.embedding_size);
466
467        // Compute basic features
468        let dim_features = self.compute_dimensionality_features(current_params, problem);
469        let grad_features = self.compute_gradient_features(objective, current_params);
470        let hessian_features = self.compute_hessian_features(objective, current_params);
471
472        // Combine features
473        for i in 0..self.embedding_size {
474            for j in 0..dim_features.len().min(self.dim_encoder.ncols()) {
475                embedding[i] += self.dim_encoder[[i, j]] * dim_features[j];
476            }
477            for j in 0..grad_features.len().min(self.gradient_encoder.ncols()) {
478                embedding[i] += self.gradient_encoder[[i, j]] * grad_features[j];
479            }
480            for j in 0..hessian_features.len().min(self.hessian_encoder.ncols()) {
481                embedding[i] += self.hessian_encoder[[i, j]] * hessian_features[j];
482            }
483        }
484
485        embedding
486    }
487
488    fn compute_dimensionality_features(
489        &self,
490        params: &ArrayView1<f64>,
491        problem: &OptimizationProblem,
492    ) -> Array1<f64> {
493        let mut features = Array1::zeros(10);
494
495        features[0] = (params.len() as f64).ln(); // Log dimensionality
496        features[1] = params.view().variance(); // Parameter variance
497        features[2] = params.view().mean(); // Parameter mean
498        features[3] = params.iter().map(|&x| x.abs()).sum::<f64>() / params.len() as f64; // L1 norm
499        features[4] = (params.iter().map(|&x| x * x).sum::<f64>()).sqrt(); // L2 norm
500        features[5] = problem.dimension as f64 / 1000.0; // Normalized dimension
501        features[6] = problem.max_evaluations as f64 / 10000.0; // Normalized budget
502        features[7] = problem.target_accuracy.ln().abs(); // Log target accuracy
503
504        // Add problem-specific metadata
505        if let Some(&complexity) = problem.metadata.get("complexity") {
506            features[8] = complexity.tanh();
507        }
508        if let Some(&sparsity) = problem.metadata.get("sparsity") {
509            features[9] = sparsity;
510        }
511
512        features
513    }
514
515    fn compute_gradient_features<F>(&self, objective: &F, params: &ArrayView1<f64>) -> Array1<f64>
516    where
517        F: Fn(&ArrayView1<f64>) -> f64,
518    {
519        let mut features = Array1::zeros(10);
520        let h = 1e-6;
521        let f0 = objective(params);
522
523        let mut gradient_norm = 0.0;
524        let mut gradient_components = Vec::new();
525
526        // Compute finite difference gradient
527        for i in 0..params.len().min(20) {
528            // Limit for efficiency
529            let mut params_plus = params.to_owned();
530            params_plus[i] += h;
531            let f_plus = objective(&params_plus.view());
532            let grad_i = (f_plus - f0) / h;
533            gradient_components.push(grad_i);
534            gradient_norm += grad_i * grad_i;
535        }
536
537        gradient_norm = gradient_norm.sqrt();
538
539        features[0] = gradient_norm.ln().tanh(); // Log gradient norm
540        features[1] = f0.abs().ln().tanh(); // Log objective value
541
542        if !gradient_components.is_empty() {
543            let grad_mean =
544                gradient_components.iter().sum::<f64>() / gradient_components.len() as f64;
545            let grad_var = gradient_components
546                .iter()
547                .map(|&g| (g - grad_mean).powi(2))
548                .sum::<f64>()
549                / gradient_components.len() as f64;
550
551            features[2] = grad_mean.tanh();
552            features[3] = grad_var.sqrt().tanh();
553            features[4] = gradient_components
554                .iter()
555                .map(|&g| g.abs())
556                .max_by(|a, b| a.partial_cmp(b).unwrap())
557                .unwrap_or(0.0)
558                .tanh();
559            features[5] = gradient_components
560                .iter()
561                .map(|&g| g.abs())
562                .min_by(|a, b| a.partial_cmp(b).unwrap())
563                .unwrap_or(0.0)
564                .tanh();
565        }
566
567        features
568    }
569
570    fn compute_hessian_features<F>(&self, objective: &F, params: &ArrayView1<f64>) -> Array1<f64>
571    where
572        F: Fn(&ArrayView1<f64>) -> f64,
573    {
574        let mut features = Array1::zeros(10);
575        let h = 1e-4;
576        let f0 = objective(params);
577
578        // Compute a few diagonal Hessian elements for efficiency
579        for i in 0..params.len().min(5) {
580            let mut params_plus = params.to_owned();
581            let mut params_minus = params.to_owned();
582
583            params_plus[i] += h;
584            params_minus[i] -= h;
585
586            let f_plus = objective(&params_plus.view());
587            let f_minus = objective(&params_minus.view());
588
589            let hessian_ii = (f_plus - 2.0 * f0 + f_minus) / (h * h);
590
591            if i < features.len() {
592                features[i] = hessian_ii.tanh();
593            }
594        }
595
596        features
597    }
598}
599
600/// Convenience function for learned optimization
601#[allow(dead_code)]
602pub fn learned_optimize<F>(
603    objective: F,
604    initial_params: &ArrayView1<f64>,
605    config: Option<LearnedOptimizationConfig>,
606) -> OptimizeResult<OptimizeResults<f64>>
607where
608    F: Fn(&ArrayView1<f64>) -> f64,
609{
610    let config = config.unwrap_or_default();
611
612    // Create meta-learning optimizer
613    let mut optimizer = MetaLearningOptimizer::new(config);
614
615    // Simple optimization without extensive meta-training
616    optimizer.optimize(objective, initial_params)
617}
618
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration exposes the documented stock values.
    #[test]
    fn test_learned_optimization_config() {
        let config = LearnedOptimizationConfig::default();
        assert_eq!(config.meta_training_episodes, 10000);
        assert_eq!(config.hidden_size, 256);
        assert!(config.use_transformer);
    }

    /// One weight matrix and one layer norm are created per hidden size.
    #[test]
    fn test_optimization_network_creation() {
        let network = OptimizationNetwork::new(10, vec![32, 32], 5, true, ActivationType::GELU);

        assert_eq!(network.hidden_layers.len(), 2);
        assert_eq!(network.layer_norms.len(), 2);
        assert!(network.attention_weights.is_some());
    }

    /// The forward pass yields one finite value per output unit.
    #[test]
    fn test_network_forward_pass() {
        let network = OptimizationNetwork::new(5, vec![10], 3, false, ActivationType::ReLU);

        let input = Array1::from(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
        let output = network.forward(&input.view());

        assert_eq!(output.len(), 3);
        assert!(output.iter().all(|&x| x.is_finite()));
    }

    /// Spot checks of the activation functions at simple inputs.
    #[test]
    fn test_activation_functions() {
        assert_eq!(ActivationType::ReLU.apply(-1.0), 0.0);
        assert_eq!(ActivationType::ReLU.apply(1.0), 1.0);
        assert!(ActivationType::GELU.apply(0.0).abs() < 0.1);
        assert!(ActivationType::Swish.apply(0.0).abs() < 0.1);
    }

    /// Encoding a small quadratic problem gives a finite embedding of the requested size.
    #[test]
    fn test_problem_encoder() {
        let encoder = ProblemEncoder::new(32);
        let params = Array1::from(vec![1.0, 2.0]);
        let objective = |x: &ArrayView1<f64>| x[0].powi(2) + x[1].powi(2);

        let problem = OptimizationProblem {
            name: "test".to_string(),
            dimension: 2,
            problem_class: "quadratic".to_string(),
            metadata: HashMap::new(),
            max_evaluations: 1000,
            target_accuracy: 1e-6,
        };

        let embedding = encoder.encode_problem(&objective, &params.view(), &problem);
        assert_eq!(embedding.len(), 32);
        assert!(embedding.iter().all(|&x| x.is_finite()));
    }

    /// End-to-end smoke test of `learned_optimize` on a two-dimensional quadratic.
    #[test]
    fn test_basic_learned_optimization() {
        let objective = |x: &ArrayView1<f64>| x[0].powi(2) + x[1].powi(2);
        let initial = Array1::from(vec![2.0, 2.0]);

        let config = LearnedOptimizationConfig {
            meta_training_episodes: 10,
            inner_steps: 5,
            ..Default::default()
        };

        let result = learned_optimize(objective, &initial.view(), Some(config)).unwrap();

        assert!(result.fun >= 0.0);
        assert_eq!(result.x.len(), 2);
    }
}
696
/// No-op kept so the module always exports at least one plain function.
#[allow(dead_code)]
pub fn placeholder() {
    // Placeholder function to prevent unused module warnings
}