scirs2_optimize/learned_optimizers/adaptive_transformer_enhancement.rs

//! Adaptive Transformer Enhancement for Optimization
//!
//! This module implements transformer-based neural architectures that adaptively
//! enhance optimization algorithms. The transformers learn to attend to different
//! aspects of the optimization landscape and adapt their strategies accordingly.
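//!
//! A minimal usage sketch (illustrative only, assuming the default
//! `LearnedOptimizationConfig` and a simple quadratic objective):
//!
//! ```ignore
//! use scirs2_core::ndarray::{Array1, ArrayView1};
//!
//! let objective = |x: &ArrayView1<f64>| x.iter().map(|&xi| xi * xi).sum::<f64>();
//! let initial = Array1::from(vec![1.0, -2.0]);
//! let result = transformer_optimize(objective, &initial.view(), None).unwrap();
//! assert!(result.fun >= 0.0);
//! ```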

use super::{
    ActivationType, LearnedOptimizationConfig, LearnedOptimizer, MetaOptimizerState,
    OptimizationProblem, TrainingTask,
};
use crate::error::{OptimizeError, OptimizeResult};
use crate::result::OptimizeResults;
use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
use scirs2_core::random::Rng;
use statrs::statistics::Statistics;
use std::collections::{HashMap, VecDeque};

/// Adaptive Transformer-Enhanced Optimizer
#[derive(Debug, Clone)]
pub struct AdaptiveTransformerOptimizer {
    /// Configuration
    config: LearnedOptimizationConfig,
    /// Multi-head transformer
    transformer: OptimizationTransformer,
    /// Problem encoder
    problem_encoder: TransformerProblemEncoder,
    /// Optimization history buffer
    history_buffer: OptimizationHistory,
    /// Meta-optimizer state
    meta_state: MetaOptimizerState,
    /// Adaptive components
    adaptive_components: AdaptiveComponents,
    /// Performance metrics
    performance_metrics: TransformerMetrics,
}

/// Transformer architecture for optimization
#[derive(Debug, Clone)]
pub struct OptimizationTransformer {
    /// Number of transformer layers
    num_layers: usize,
    /// Transformer blocks
    transformer_blocks: Vec<TransformerBlock>,
    /// Position encoding
    position_encoding: Array2<f64>,
    /// Input embedding layer
    input_embedding: Array2<f64>,
    /// Output projection layer
    output_projection: Array2<f64>,
    /// Model dimension
    model_dim: usize,
}

/// Single transformer block
#[derive(Debug, Clone)]
pub struct TransformerBlock {
    /// Multi-head attention
    attention: MultiHeadAttention,
    /// Feed-forward network
    feed_forward: FeedForwardNetwork,
    /// Layer normalization 1
    layer_norm1: LayerNormalization,
    /// Layer normalization 2
    layer_norm2: LayerNormalization,
    /// Dropout rate
    dropout_rate: f64,
}

/// Multi-head attention mechanism
#[derive(Debug, Clone)]
pub struct MultiHeadAttention {
    /// Number of attention heads
    num_heads: usize,
    /// Head dimension
    head_dim: usize,
    /// Query weights
    w_query: Array2<f64>,
    /// Key weights
    w_key: Array2<f64>,
    /// Value weights
    w_value: Array2<f64>,
    /// Output projection
    w_output: Array2<f64>,
    /// Attention scores history (for analysis)
    attention_scores: Vec<Array2<f64>>,
}

/// Feed-forward network
#[derive(Debug, Clone)]
pub struct FeedForwardNetwork {
    /// First linear layer
    linear1: Array2<f64>,
    /// Second linear layer
    linear2: Array2<f64>,
    /// First-layer bias terms
    bias1: Array1<f64>,
    /// Second-layer bias terms
    bias2: Array1<f64>,
    /// Activation function
    activation: ActivationType,
    /// Hidden dimension
    hidden_dim: usize,
}

/// Layer normalization
#[derive(Debug, Clone)]
pub struct LayerNormalization {
    /// Scale parameters
    gamma: Array1<f64>,
    /// Shift parameters
    beta: Array1<f64>,
    /// Epsilon for numerical stability
    epsilon: f64,
}

/// Problem encoder for transformers
#[derive(Debug, Clone)]
pub struct TransformerProblemEncoder {
    /// Gradient encoding layer
    gradient_encoder: Array2<f64>,
    /// Hessian encoding layer
    hessian_encoder: Array2<f64>,
    /// Parameter encoding layer
    parameter_encoder: Array2<f64>,
    /// Temporal encoding layer
    temporal_encoder: Array2<f64>,
    /// Context encoding layer
    context_encoder: Array2<f64>,
    /// Embedding dimension
    embedding_dim: usize,
}

/// Optimization history for transformer context
#[derive(Debug, Clone)]
pub struct OptimizationHistory {
    /// Parameter history
    parameter_history: VecDeque<Array1<f64>>,
    /// Objective value history
    objective_history: VecDeque<f64>,
    /// Gradient history
    gradient_history: VecDeque<Array1<f64>>,
    /// Step size history
    step_size_history: VecDeque<f64>,
    /// Success/failure history
    success_history: VecDeque<bool>,
    /// Maximum history length
    max_length: usize,
    /// Current step
    current_step: usize,
}

/// Adaptive components for transformer optimization
#[derive(Debug, Clone)]
pub struct AdaptiveComponents {
    /// Adaptive attention weights
    attention_adaptation: AttentionAdaptation,
    /// Dynamic learning rate scheduler
    learning_rate_adapter: LearningRateAdapter,
    /// Gradient scaling mechanism
    gradient_scaler: GradientScaler,
    /// Step size predictor
    step_size_predictor: StepSizePredictor,
    /// Convergence detector
    convergence_detector: ConvergenceDetector,
}

/// Attention adaptation mechanism
#[derive(Debug, Clone)]
pub struct AttentionAdaptation {
    /// Adaptation rate
    adaptation_rate: f64,
    /// Current attention focus
    attention_focus: Array1<f64>,
    /// Focus history
    focus_history: VecDeque<Array1<f64>>,
    /// Problem-specific attention patterns
    problem_patterns: HashMap<String, Array1<f64>>,
}

/// Learning rate adapter
#[derive(Debug, Clone)]
pub struct LearningRateAdapter {
    /// Base learning rate
    base_lr: f64,
    /// Current learning rate
    current_lr: f64,
    /// Adaptation parameters
    adaptation_params: Array1<f64>,
    /// Performance window
    performance_window: VecDeque<f64>,
    /// Adaptation history
    lr_history: Vec<f64>,
}

/// Gradient scaling mechanism
#[derive(Debug, Clone)]
pub struct GradientScaler {
    /// Scaling factors
    scale_factors: Array1<f64>,
    /// Gradient statistics
    gradient_stats: GradientStatistics,
    /// Adaptive scaling parameters
    scaling_params: Array1<f64>,
}

/// Gradient statistics
#[derive(Debug, Clone)]
pub struct GradientStatistics {
    /// Running mean
    mean: Array1<f64>,
    /// Running variance
    variance: Array1<f64>,
    /// Update count
    count: usize,
    /// Momentum parameter
    momentum: f64,
}

/// Step size predictor
#[derive(Debug, Clone)]
pub struct StepSizePredictor {
    /// Prediction network
    predictor_network: Array2<f64>,
    /// Input features
    feature_dim: usize,
    /// Prediction history
    prediction_history: Vec<f64>,
    /// Actual step sizes
    actual_steps: Vec<f64>,
}

/// Convergence detector
#[derive(Debug, Clone)]
pub struct ConvergenceDetector {
    /// Detection threshold
    threshold: f64,
    /// Window size for analysis
    window_size: usize,
    /// Recent improvements
    recent_improvements: VecDeque<f64>,
    /// Convergence probability
    convergence_prob: f64,
}

/// Performance metrics for transformer
#[derive(Debug, Clone)]
pub struct TransformerMetrics {
    /// Attention entropy
    attention_entropy: f64,
    /// Learning rate adaptation efficiency
    lr_adaptation_efficiency: f64,
    /// Gradient prediction accuracy
    gradient_prediction_accuracy: f64,
    /// Step size prediction accuracy
    step_size_prediction_accuracy: f64,
    /// Convergence detection accuracy
    convergence_detection_accuracy: f64,
}

impl AdaptiveTransformerOptimizer {
    /// Create new adaptive transformer optimizer
    pub fn new(config: LearnedOptimizationConfig) -> Self {
        let model_dim = config.hidden_size;
        let transformer = OptimizationTransformer::new(
            config.num_heads,
            model_dim,
            config.max_parameters,
            6, // num_layers
        );

        let problem_encoder = TransformerProblemEncoder::new(model_dim);
        let history_buffer = OptimizationHistory::new(100);

        Self {
            config,
            transformer,
            problem_encoder,
            history_buffer,
            meta_state: MetaOptimizerState {
                meta_params: Array1::zeros(model_dim),
                network_weights: Array2::zeros((model_dim, model_dim)),
                performance_history: Vec::new(),
                adaptation_stats: super::AdaptationStatistics::default(),
                episode: 0,
            },
            adaptive_components: AdaptiveComponents::new(model_dim),
            performance_metrics: TransformerMetrics::default(),
        }
    }

    /// Process optimization step with transformer
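    ///
    /// Pipeline sketch: encode the current state plus recent history into a
    /// sequence, run the transformer forward pass, decode the output into an
    /// [`OptimizationStep`], update the adaptive components, and record the
    /// step in the history buffer.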
    pub fn process_optimization_step<F>(
        &mut self,
        objective: &F,
        current_params: &ArrayView1<f64>,
        problem: &OptimizationProblem,
    ) -> OptimizeResult<OptimizationStep>
    where
        F: Fn(&ArrayView1<f64>) -> f64,
    {
        // Encode current state
        let state_encoding = self.encode_optimization_state(objective, current_params, problem)?;

        // Process through transformer
        let transformer_output = self.transformer.forward(&state_encoding.view())?;

        // Extract optimization decisions
        let optimization_step = self.decode_optimization_step(&transformer_output.view())?;

        // Update adaptive components
        self.update_adaptive_components(&optimization_step)?;

        // Record in history
        self.history_buffer.add_step(
            current_params.to_owned(),
            objective(current_params),
            optimization_step.clone(),
        );

        Ok(optimization_step)
    }

    /// Encode current optimization state
    fn encode_optimization_state<F>(
        &self,
        objective: &F,
        current_params: &ArrayView1<f64>,
        problem: &OptimizationProblem,
    ) -> OptimizeResult<Array2<f64>>
    where
        F: Fn(&ArrayView1<f64>) -> f64,
    {
        let seq_len = self.history_buffer.current_step.min(50) + 1; // Include current state
        let model_dim = self.transformer.model_dim;
        let mut sequence = Array2::zeros((seq_len, model_dim));

        // Encode historical states
        for i in 0..seq_len - 1 {
            if let Some(historical_encoding) = self.encode_historical_state(i) {
                for j in 0..model_dim.min(historical_encoding.len()) {
                    sequence[[i, j]] = historical_encoding[j];
                }
            }
        }

        // Encode current state
        let current_encoding =
            self.problem_encoder
                .encode_current_state(objective, current_params, problem)?;

        let last_idx = seq_len - 1;
        for j in 0..model_dim.min(current_encoding.len()) {
            sequence[[last_idx, j]] = current_encoding[j];
        }

        Ok(sequence)
    }

    /// Encode historical state
    fn encode_historical_state(&self, history_index: usize) -> Option<Array1<f64>> {
        if history_index >= self.history_buffer.parameter_history.len() {
            return None;
        }

        let params = &self.history_buffer.parameter_history[history_index];
        let obj_val = self.history_buffer.objective_history[history_index];

        // Create encoding from historical data
        let mut encoding = Array1::zeros(self.transformer.model_dim);

        // Parameter features
        for (i, &param) in params.iter().enumerate() {
            if i < encoding.len() / 4 {
                encoding[i] = param.tanh();
            }
        }

        // Objective value features
        let obj_idx = encoding.len() / 4;
        if obj_idx < encoding.len() {
            // Guard against non-positive objective values before taking the log
            encoding[obj_idx] = (obj_val.abs() + 1e-12).ln().tanh();
        }

        // Gradient features (if available)
        if let Some(gradient) = self.history_buffer.gradient_history.get(history_index) {
            let grad_start = encoding.len() / 2;
            for (i, &grad) in gradient.iter().enumerate() {
                if grad_start + i < encoding.len() {
                    encoding[grad_start + i] = grad.tanh();
                }
            }
        }

        Some(encoding)
    }

    /// Decode optimization step from transformer output
    fn decode_optimization_step(
        &self,
        transformer_output: &ArrayView2<f64>,
    ) -> OptimizeResult<OptimizationStep> {
        if transformer_output.is_empty() {
            return Err(OptimizeError::InvalidInput(
                "Empty transformer output".to_string(),
            ));
        }

        // Extract last timestep output
        let last_output = transformer_output.row(transformer_output.nrows() - 1);

        // Decode step size
        let step_size_raw = last_output.get(0).copied().unwrap_or(0.0);
        let step_size = (step_size_raw.tanh() + 1.0) * 0.01; // Map to [0, 0.02]

        // Decode direction (simplified)
        let direction_dim = self.meta_state.meta_params.len().min(last_output.len() - 1);
        let mut direction = Array1::zeros(direction_dim);
        for i in 0..direction_dim {
            direction[i] = last_output.get(i + 1).copied().unwrap_or(0.0).tanh();
        }

        // Decode learning rate adaptation
        let lr_factor_raw = last_output
            .get(last_output.len() / 2)
            .copied()
            .unwrap_or(0.0);
        let lr_adaptation_factor = (lr_factor_raw.tanh() + 1.0) * 0.5 + 0.5; // Map to [0.5, 1.5]

        // Decode convergence confidence
        let conv_raw = last_output
            .get(last_output.len() - 1)
            .copied()
            .unwrap_or(0.0);
        let convergence_confidence = (conv_raw.tanh() + 1.0) * 0.5; // Map to [0, 1]

        Ok(OptimizationStep {
            step_size,
            direction,
            lr_adaptation_factor,
            convergence_confidence,
            attention_weights: self.get_attention_weights(),
        })
    }

    /// Get current attention weights for analysis
    fn get_attention_weights(&self) -> Array2<f64> {
        if let Some(first_block) = self.transformer.transformer_blocks.first() {
            if let Some(last_attention) = first_block.attention.attention_scores.last() {
                return last_attention.clone();
            }
        }
        Array2::zeros((1, 1))
    }

    /// Update adaptive components
    fn update_adaptive_components(&mut self, step: &OptimizationStep) -> OptimizeResult<()> {
        // Update attention adaptation
        self.adaptive_components
            .attention_adaptation
            .update(&step.attention_weights)?;

        // Update learning rate adapter
        self.adaptive_components
            .learning_rate_adapter
            .update(step.lr_adaptation_factor)?;

        // Update convergence detector
        self.adaptive_components
            .convergence_detector
            .update(step.convergence_confidence)?;

        Ok(())
    }

    /// Adapt transformer to specific problem characteristics
    pub fn adapt_to_problem_class(&mut self, problem_class: &str) -> OptimizeResult<()> {
        // Adjust attention patterns based on problem type
        match problem_class {
            "quadratic" => {
                // Focus more on recent gradients
                self.adaptive_components
                    .attention_adaptation
                    .set_focus_pattern(
                        Array1::from(vec![0.1, 0.2, 0.7]), // Recent bias
                    );
            }
            "neural_network" => {
                // Balance between recent and historical information
                self.adaptive_components
                    .attention_adaptation
                    .set_focus_pattern(
                        Array1::from(vec![0.3, 0.4, 0.3]), // Balanced
                    );
            }
            "sparse" => {
                // Focus on gradient magnitude patterns
                self.adaptive_components
                    .attention_adaptation
                    .set_focus_pattern(
                        Array1::from(vec![0.5, 0.3, 0.2]), // Historical bias
                    );
            }
            _ => {
                // Default balanced pattern
                self.adaptive_components
                    .attention_adaptation
                    .set_focus_pattern(Array1::from(vec![0.3, 0.4, 0.3]));
            }
        }

        Ok(())
    }

    /// Fine-tune transformer on specific optimization trajectories
    pub fn fine_tune_on_trajectories(
        &mut self,
        trajectories: &[OptimizationTrajectory],
    ) -> OptimizeResult<()> {
        for trajectory in trajectories {
            // Process each step in trajectory
            for step in &trajectory.steps {
                // Update transformer weights based on successful steps
                if step.improvement > 0.0 {
                    self.update_transformer_weights(&step.state_encoding, &step.action_encoding)?;
                }
            }
        }

        Ok(())
    }

    fn update_transformer_weights(
        &mut self,
        state_encoding: &Array2<f64>,
        action_encoding: &Array1<f64>,
    ) -> OptimizeResult<()> {
        // Simplified weight update (in practice would use proper backpropagation)
        let learning_rate = self.config.meta_learning_rate;

        // Update output projection to better predict successful actions
        for i in 0..self
            .transformer
            .output_projection
            .nrows()
            .min(action_encoding.len())
        {
            for j in 0..self.transformer.output_projection.ncols() {
                if let Some(&state_val) = state_encoding.get((state_encoding.nrows() - 1, j)) {
                    self.transformer.output_projection[[i, j]] +=
                        learning_rate * action_encoding[i] * state_val;
                }
            }
        }

        Ok(())
    }

    /// Get transformer performance metrics
    pub fn get_performance_metrics(&self) -> &TransformerMetrics {
        &self.performance_metrics
    }

    /// Update performance metrics
    fn update_performance_metrics(&mut self) {
        // Compute attention entropy
        if let Some(attention_scores) = self.get_latest_attention_scores() {
            self.performance_metrics.attention_entropy =
                compute_attention_entropy(&attention_scores);
        }

        // Update other metrics based on adaptive components
        self.performance_metrics.lr_adaptation_efficiency = self
            .adaptive_components
            .learning_rate_adapter
            .get_efficiency();

        self.performance_metrics.convergence_detection_accuracy =
            self.adaptive_components.convergence_detector.get_accuracy();
    }

    fn get_latest_attention_scores(&self) -> Option<Array2<f64>> {
        self.transformer
            .transformer_blocks
            .first()?
            .attention
            .attention_scores
            .last()
            .cloned()
    }
}

/// Optimization step output from transformer
#[derive(Debug, Clone)]
pub struct OptimizationStep {
    /// Suggested step size
    pub step_size: f64,
    /// Search direction
    pub direction: Array1<f64>,
    /// Learning rate adaptation factor
    pub lr_adaptation_factor: f64,
    /// Confidence in convergence
    pub convergence_confidence: f64,
    /// Attention weights for interpretability
    pub attention_weights: Array2<f64>,
}

/// Optimization trajectory for training
#[derive(Debug, Clone)]
pub struct OptimizationTrajectory {
    /// Steps in the trajectory
    pub steps: Vec<TrajectoryStep>,
    /// Final objective value
    pub final_objective: f64,
    /// Success indicator
    pub success: bool,
}

/// Single step in optimization trajectory
#[derive(Debug, Clone)]
pub struct TrajectoryStep {
    /// State encoding at this step
    pub state_encoding: Array2<f64>,
    /// Action taken
    pub action_encoding: Array1<f64>,
    /// Improvement achieved
    pub improvement: f64,
    /// Step number
    pub step_number: usize,
}

impl OptimizationTransformer {
    /// Create new optimization transformer
    pub fn new(num_heads: usize, model_dim: usize, max_seq_len: usize, num_layers: usize) -> Self {
        let mut transformer_blocks = Vec::new();

        for _ in 0..num_layers {
            transformer_blocks.push(TransformerBlock::new(num_heads, model_dim));
        }

        // Position encoding
        let position_encoding = Self::create_position_encoding(max_seq_len, model_dim);

        // Input embedding
        let input_embedding = Array2::from_shape_fn((model_dim, model_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / model_dim as f64).sqrt()
        });

        // Output projection
        let output_projection = Array2::from_shape_fn((model_dim, model_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / model_dim as f64).sqrt()
        });

        Self {
            num_layers,
            transformer_blocks,
            position_encoding,
            input_embedding,
            output_projection,
            model_dim,
        }
    }

    /// Forward pass through transformer
    pub fn forward(&mut self, input_sequence: &ArrayView2<f64>) -> OptimizeResult<Array2<f64>> {
        let seq_len = input_sequence.nrows();
        let input_dim = input_sequence.ncols();

        // Input embedding
        let mut embedded = Array2::zeros((seq_len, self.model_dim));
        for i in 0..seq_len {
            for j in 0..self.model_dim {
                for k in 0..input_dim.min(self.input_embedding.ncols()) {
                    embedded[[i, j]] += self.input_embedding[[j, k]] * input_sequence[[i, k]];
                }
            }
        }

        // Add positional encoding
        for i in 0..seq_len.min(self.position_encoding.nrows()) {
            for j in 0..self.model_dim.min(self.position_encoding.ncols()) {
                embedded[[i, j]] += self.position_encoding[[i, j]];
            }
        }

        // Pass through transformer blocks
        let mut current = embedded;
        for block in &mut self.transformer_blocks {
            current = block.forward(&current.view())?;
        }

        // Output projection
        let mut output = Array2::zeros((seq_len, self.model_dim));
        for i in 0..seq_len {
            for j in 0..self.model_dim {
                for k in 0..self.model_dim.min(self.output_projection.ncols()) {
                    output[[i, j]] += self.output_projection[[j, k]] * current[[i, k]];
                }
            }
        }

        Ok(output)
    }

    /// Create sinusoidal position encoding
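    ///
    /// Uses the angle `pos / 10000^(2i / d_model)`, with sine on even indices
    /// and cosine on odd indices (a common simplification of the Vaswani et al.
    /// scheme, which indexes by dimension pairs).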
    fn create_position_encoding(max_len: usize, model_dim: usize) -> Array2<f64> {
        let mut pos_encoding = Array2::zeros((max_len, model_dim));

        for pos in 0..max_len {
            for i in 0..model_dim {
                let angle = pos as f64 / 10000_f64.powf(2.0 * i as f64 / model_dim as f64);
                if i % 2 == 0 {
                    pos_encoding[[pos, i]] = angle.sin();
                } else {
                    pos_encoding[[pos, i]] = angle.cos();
                }
            }
        }

        pos_encoding
    }
}

impl TransformerBlock {
    /// Create new transformer block
    pub fn new(num_heads: usize, model_dim: usize) -> Self {
        let attention = MultiHeadAttention::new(num_heads, model_dim);
        let feed_forward = FeedForwardNetwork::new(model_dim, model_dim * 4);
        let layer_norm1 = LayerNormalization::new(model_dim);
        let layer_norm2 = LayerNormalization::new(model_dim);

        Self {
            attention,
            feed_forward,
            layer_norm1,
            layer_norm2,
            dropout_rate: 0.1,
        }
    }

    /// Forward pass through transformer block
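    ///
    /// Post-norm residual ordering: `LayerNorm(x + Attention(x))` followed by
    /// `LayerNorm(x + FFN(x))`. Note that `dropout_rate` is stored but not
    /// applied in this forward pass.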
    pub fn forward(&mut self, input: &ArrayView2<f64>) -> OptimizeResult<Array2<f64>> {
        // Multi-head attention with residual connection
        let attention_output = self.attention.forward(input, input, input)?;
        let residual1 = input + &attention_output.view();
        let after_attention = self.layer_norm1.forward(&residual1.view())?;

        // Feed-forward with residual connection
        let ff_output = self.feed_forward.forward(&after_attention.view())?;
        let residual2 = &after_attention + &ff_output.view();
        let output = self.layer_norm2.forward(&residual2.view())?;

        Ok(output)
    }
}

impl MultiHeadAttention {
    /// Create new multi-head attention
    pub fn new(num_heads: usize, model_dim: usize) -> Self {
        assert_eq!(model_dim % num_heads, 0);
        let head_dim = model_dim / num_heads;

        let w_query = Array2::from_shape_fn((model_dim, model_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / model_dim as f64).sqrt()
        });
        let w_key = Array2::from_shape_fn((model_dim, model_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / model_dim as f64).sqrt()
        });
        let w_value = Array2::from_shape_fn((model_dim, model_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / model_dim as f64).sqrt()
        });
        let w_output = Array2::from_shape_fn((model_dim, model_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / model_dim as f64).sqrt()
        });

        Self {
            num_heads,
            head_dim,
            w_query,
            w_key,
            w_value,
            w_output,
            attention_scores: Vec::new(),
        }
    }

    /// Forward pass through multi-head attention
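    ///
    /// Splits the model dimension into `num_heads` contiguous slices, applies
    /// scaled dot-product attention per head, concatenates the head outputs,
    /// and projects the result with `w_output`.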
    pub fn forward(
        &mut self,
        query: &ArrayView2<f64>,
        key: &ArrayView2<f64>,
        value: &ArrayView2<f64>,
    ) -> OptimizeResult<Array2<f64>> {
        let seq_len = query.nrows();
        let model_dim = query.ncols();

        // Compute Q, K, V
        let q = self.linear_transform(query, &self.w_query)?;
        let k = self.linear_transform(key, &self.w_key)?;
        let v = self.linear_transform(value, &self.w_value)?;

        // Reshape for multi-head attention
        let mut attention_output = Array2::zeros((seq_len, model_dim));

        for head in 0..self.num_heads {
            let head_start = head * self.head_dim;
            let head_end = head_start + self.head_dim;

            // Extract head-specific Q, K, V
            let q_head = q.slice(scirs2_core::ndarray::s![.., head_start..head_end]);
            let k_head = k.slice(scirs2_core::ndarray::s![.., head_start..head_end]);
            let v_head = v.slice(scirs2_core::ndarray::s![.., head_start..head_end]);

            // Compute attention scores
            let scores = self.compute_attention_scores(&q_head, &k_head)?;

            // Apply attention to values
            let head_output = self.apply_attention(&scores, &v_head)?;

            // Add to output
            for i in 0..seq_len {
                for j in 0..self.head_dim.min(model_dim - head_start) {
                    attention_output[[i, head_start + j]] = head_output[[i, j]];
                }
            }
        }

        // Output projection
        let output = self.linear_transform(&attention_output.view(), &self.w_output)?;

        Ok(output)
    }

    fn linear_transform(
        &self,
        input: &ArrayView2<f64>,
        weight: &Array2<f64>,
    ) -> OptimizeResult<Array2<f64>> {
        let seq_len = input.nrows();
        let input_dim = input.ncols();
        let output_dim = weight.nrows();

        let mut output = Array2::zeros((seq_len, output_dim));

        for i in 0..seq_len {
            for j in 0..output_dim {
                for k in 0..input_dim.min(weight.ncols()) {
                    output[[i, j]] += weight[[j, k]] * input[[i, k]];
                }
            }
        }

        Ok(output)
    }

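    /// Scaled dot-product attention scores, `softmax(Q Kᵀ / sqrt(d_head))`,
    /// computed with row-wise max subtraction for numerical stability; the
    /// last ten score matrices are retained for analysis.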
    fn compute_attention_scores(
        &mut self,
        query: &ArrayView2<f64>,
        key: &ArrayView2<f64>,
    ) -> OptimizeResult<Array2<f64>> {
        let seq_len = query.nrows();
        let head_dim = query.ncols();

        let mut scores = Array2::zeros((seq_len, seq_len));
        let scale = 1.0 / (head_dim as f64).sqrt();

        for i in 0..seq_len {
            for j in 0..seq_len {
                let mut dot_product = 0.0;
                for k in 0..head_dim {
                    dot_product += query[[i, k]] * key[[j, k]];
                }
                scores[[i, j]] = dot_product * scale;
            }
        }

        // Apply softmax
        for i in 0..seq_len {
            let mut row_sum = 0.0;
            let max_val = scores.row(i).fold(-f64::INFINITY, |a, &b| a.max(b));

            for j in 0..seq_len {
                scores[[i, j]] = (scores[[i, j]] - max_val).exp();
                row_sum += scores[[i, j]];
            }

            if row_sum > 0.0 {
                for j in 0..seq_len {
                    scores[[i, j]] /= row_sum;
                }
            }
        }

        // Store for analysis
        self.attention_scores.push(scores.clone());
        if self.attention_scores.len() > 10 {
            self.attention_scores.remove(0);
        }

        Ok(scores)
    }

    fn apply_attention(
        &self,
        scores: &Array2<f64>,
        values: &ArrayView2<f64>,
    ) -> OptimizeResult<Array2<f64>> {
        let seq_len = scores.nrows();
        let head_dim = values.ncols();

        let mut output = Array2::zeros((seq_len, head_dim));

        for i in 0..seq_len {
            for j in 0..head_dim {
                for k in 0..seq_len {
                    output[[i, j]] += scores[[i, k]] * values[[k, j]];
                }
            }
        }

        Ok(output)
    }
}

impl FeedForwardNetwork {
    /// Create new feed-forward network
    pub fn new(input_dim: usize, hidden_dim: usize) -> Self {
        let linear1 = Array2::from_shape_fn((hidden_dim, input_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / input_dim as f64).sqrt()
        });
        let linear2 = Array2::from_shape_fn((input_dim, hidden_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / hidden_dim as f64).sqrt()
        });

        Self {
            linear1,
            linear2,
            bias1: Array1::zeros(hidden_dim),
            bias2: Array1::zeros(input_dim),
            activation: ActivationType::GELU,
            hidden_dim,
        }
    }

    /// Forward pass through feed-forward network
    pub fn forward(&self, input: &ArrayView2<f64>) -> OptimizeResult<Array2<f64>> {
        let seq_len = input.nrows();
        let input_dim = input.ncols();

        // First linear layer
        let mut hidden = Array2::zeros((seq_len, self.hidden_dim));
        for i in 0..seq_len {
            for j in 0..self.hidden_dim {
                for k in 0..input_dim.min(self.linear1.ncols()) {
                    hidden[[i, j]] += self.linear1[[j, k]] * input[[i, k]];
                }
                hidden[[i, j]] += self.bias1[j];
                hidden[[i, j]] = self.activation.apply(hidden[[i, j]]);
            }
        }

        // Second linear layer
        let mut output = Array2::zeros((seq_len, input_dim));
        for i in 0..seq_len {
            for j in 0..input_dim {
                for k in 0..self.hidden_dim.min(self.linear2.ncols()) {
                    output[[i, j]] += self.linear2[[j, k]] * hidden[[i, k]];
                }
                output[[i, j]] += self.bias2[j];
            }
        }

        Ok(output)
    }
}

impl LayerNormalization {
    /// Create new layer normalization
    pub fn new(dim: usize) -> Self {
        Self {
            gamma: Array1::ones(dim),
            beta: Array1::zeros(dim),
            epsilon: 1e-6,
        }
    }

    /// Forward pass through layer normalization
    pub fn forward(&self, input: &ArrayView2<f64>) -> OptimizeResult<Array2<f64>> {
        let seq_len = input.nrows();
        let dim = input.ncols();
        let mut output = Array2::zeros((seq_len, dim));

        for i in 0..seq_len {
            // Compute mean and variance for this sequence position
            let row = input.row(i);
            let mean = row.mean();
            let var = input.row(i).variance();
            let std = (var + self.epsilon).sqrt();

            // Normalize
            for j in 0..dim.min(self.gamma.len()) {
                output[[i, j]] = self.gamma[j] * (input[[i, j]] - mean) / std + self.beta[j];
            }
        }

        Ok(output)
    }
}

impl TransformerProblemEncoder {
    /// Create new transformer problem encoder
    pub fn new(embedding_dim: usize) -> Self {
        let feature_dim = 20; // Fixed feature dimension

        Self {
            gradient_encoder: Array2::from_shape_fn((embedding_dim, feature_dim), |_| {
                (scirs2_core::random::rng().random::<f64>() - 0.5) * 0.1
            }),
            hessian_encoder: Array2::from_shape_fn((embedding_dim, feature_dim), |_| {
                (scirs2_core::random::rng().random::<f64>() - 0.5) * 0.1
            }),
            parameter_encoder: Array2::from_shape_fn((embedding_dim, feature_dim), |_| {
                (scirs2_core::random::rng().random::<f64>() - 0.5) * 0.1
            }),
            temporal_encoder: Array2::from_shape_fn((embedding_dim, feature_dim), |_| {
                (scirs2_core::random::rng().random::<f64>() - 0.5) * 0.1
            }),
            context_encoder: Array2::from_shape_fn((embedding_dim, feature_dim), |_| {
                (scirs2_core::random::rng().random::<f64>() - 0.5) * 0.1
            }),
            embedding_dim,
        }
    }

    /// Encode current state for transformer
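    ///
    /// The embedding is a sum of linear projections of 20-dimensional feature
    /// blocks: `encoding = E_param f_param + E_grad f_grad + E_ctx f_ctx`.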
    pub fn encode_current_state<F>(
        &self,
        objective: &F,
        current_params: &ArrayView1<f64>,
        problem: &OptimizationProblem,
    ) -> OptimizeResult<Array1<f64>>
    where
        F: Fn(&ArrayView1<f64>) -> f64,
    {
        let mut encoding = Array1::zeros(self.embedding_dim);

        // Encode different aspects
        let param_features = self.encode_parameter_features(current_params);
        let grad_features = self.encode_gradient_features(objective, current_params);
        let context_features = self.encode_context_features(problem);

        // Combine features
        self.combine_features(&mut encoding, &param_features, &self.parameter_encoder);
        self.combine_features(&mut encoding, &grad_features, &self.gradient_encoder);
        self.combine_features(&mut encoding, &context_features, &self.context_encoder);

        Ok(encoding)
    }

    fn encode_parameter_features(&self, params: &ArrayView1<f64>) -> Array1<f64> {
        let mut features = Array1::zeros(20);

        if !params.is_empty() {
            features[0] = params.view().mean().tanh();
            features[1] = params.view().variance().sqrt().tanh();
            features[2] = params.fold(-f64::INFINITY, |a, &b| a.max(b)).tanh();
            features[3] = params.fold(f64::INFINITY, |a, &b| a.min(b)).tanh();
            features[4] = (params.len() as f64).ln().tanh();

            // L-norms
            features[5] =
                (params.iter().map(|&x| x.abs()).sum::<f64>() / params.len() as f64).tanh(); // L1
            features[6] = (params.iter().map(|&x| x * x).sum::<f64>()).sqrt().tanh(); // L2

            // Statistical moments (use the raw mean/std rather than the
            // tanh-squashed values stored in features[0] and features[1])
            let mean = params.view().mean();
            let std = params.view().variance().sqrt();
            let skewness = params
                .iter()
                .map(|&x| ((x - mean) / (std + 1e-8)).powi(3))
                .sum::<f64>()
                / params.len() as f64;
            features[7] = skewness.tanh();

            // Sparsity
            let zero_count = params.iter().filter(|&&x| x.abs() < 1e-8).count();
            features[8] = (zero_count as f64 / params.len() as f64).tanh();
        }

        features
    }

    fn encode_gradient_features<F>(&self, objective: &F, params: &ArrayView1<f64>) -> Array1<f64>
    where
        F: Fn(&ArrayView1<f64>) -> f64,
    {
        let mut features = Array1::zeros(20);

        let h = 1e-6;
        let f0 = objective(params);
        let mut gradient = Array1::zeros(params.len());

        // Compute finite difference gradient
        for i in 0..params.len().min(20) {
            // Limit for efficiency
            let mut params_plus = params.to_owned();
            params_plus[i] += h;
            let f_plus = objective(&params_plus.view());
            gradient[i] = (f_plus - f0) / h;
        }

        if !gradient.is_empty() {
            features[0] = (gradient.iter().map(|&g| g * g).sum::<f64>())
                .sqrt()
                .ln()
                .tanh(); // Gradient norm
            features[1] = f0.abs().ln().tanh(); // Objective magnitude
            features[2] = gradient.view().mean().tanh(); // Gradient mean
            features[3] = gradient.view().variance().sqrt().tanh(); // Gradient std

            // Fraction of coordinates where the gradient opposes the parameter
            // sign (i.e., a negative-gradient step shrinks that parameter)
            let grad_consistency = gradient
                .iter()
                .zip(params.iter())
                .map(|(&g, &p)| if p * g < 0.0 { 1.0 } else { 0.0 })
                .sum::<f64>()
                / gradient.len() as f64;
            features[4] = grad_consistency.tanh();
        }

        features
    }

    fn encode_context_features(&self, problem: &OptimizationProblem) -> Array1<f64> {
        let mut features = Array1::zeros(20);

        features[0] = (problem.dimension as f64).ln().tanh();
        features[1] = (problem.max_evaluations as f64).ln().tanh();
        features[2] = problem.target_accuracy.ln().abs().tanh();

        // Problem class encoding (simplified)
        match problem.problem_class.as_str() {
            "quadratic" => features[3] = 1.0,
            "neural_network" => features[4] = 1.0,
            "sparse" => {
                features[5] = 1.0;
                features[6] = 1.0;
            }
            _ => {} // Default case for unknown problem classes
        }

        features
    }

    fn combine_features(
        &self,
        encoding: &mut Array1<f64>,
        features: &Array1<f64>,
        encoder: &Array2<f64>,
    ) {
        for i in 0..encoding.len() {
            for j in 0..features.len().min(encoder.ncols()) {
                encoding[i] += encoder[[i, j]] * features[j];
            }
        }
    }
}

impl OptimizationHistory {
    /// Create new optimization history
    pub fn new(max_length: usize) -> Self {
        Self {
            parameter_history: VecDeque::with_capacity(max_length),
            objective_history: VecDeque::with_capacity(max_length),
            gradient_history: VecDeque::with_capacity(max_length),
            step_size_history: VecDeque::with_capacity(max_length),
            success_history: VecDeque::with_capacity(max_length),
            max_length,
            current_step: 0,
        }
    }

    /// Add optimization step to history
    pub fn add_step(&mut self, params: Array1<f64>, objective: f64, step: OptimizationStep) {
        if self.parameter_history.len() >= self.max_length {
            self.parameter_history.pop_front();
            self.objective_history.pop_front();
            self.gradient_history.pop_front();
            self.step_size_history.pop_front();
            self.success_history.pop_front();
        }

        self.parameter_history.push_back(params);
        self.objective_history.push_back(objective);
        self.gradient_history.push_back(step.direction);
        self.step_size_history.push_back(step.step_size);
        self.success_history
            .push_back(step.convergence_confidence > 0.5);

        self.current_step += 1;
    }
}

impl AdaptiveComponents {
    /// Create new adaptive components
    pub fn new(model_dim: usize) -> Self {
        Self {
            attention_adaptation: AttentionAdaptation::new(model_dim),
            learning_rate_adapter: LearningRateAdapter::new(),
            gradient_scaler: GradientScaler::new(model_dim),
            step_size_predictor: StepSizePredictor::new(model_dim),
            convergence_detector: ConvergenceDetector::new(),
        }
    }
}

impl AttentionAdaptation {
    /// Create new attention adaptation
    pub fn new(model_dim: usize) -> Self {
        Self {
            adaptation_rate: 0.01,
            attention_focus: Array1::from_elem(model_dim, 1.0 / model_dim as f64),
            focus_history: VecDeque::with_capacity(100),
            problem_patterns: HashMap::new(),
        }
    }

    /// Update attention adaptation
    pub fn update(&mut self, attention_weights: &Array2<f64>) -> OptimizeResult<()> {
        if attention_weights.is_empty() {
            return Ok(());
        }

        // Compute attention focus from the attention weights
        let mut new_focus = Array1::zeros(self.attention_focus.len());
        for i in 0..attention_weights.nrows().min(new_focus.len()) {
            new_focus[i] = attention_weights.row(i).mean();
        }

        // Update focus with momentum
        for i in 0..self.attention_focus.len() {
            self.attention_focus[i] = (1.0 - self.adaptation_rate) * self.attention_focus[i]
                + self.adaptation_rate * new_focus.get(i).copied().unwrap_or(0.0);
        }

        // Record in history
        self.focus_history.push_back(self.attention_focus.clone());
        if self.focus_history.len() > 100 {
            self.focus_history.pop_front();
        }

        Ok(())
    }

    /// Set specific focus pattern
    pub fn set_focus_pattern(&mut self, pattern: Array1<f64>) {
        if pattern.len() <= self.attention_focus.len() {
            for (i, &val) in pattern.iter().enumerate() {
                self.attention_focus[i] = val;
            }
        }
    }
}

impl Default for LearningRateAdapter {
    fn default() -> Self {
        Self::new()
    }
}

impl LearningRateAdapter {
    /// Create new learning rate adapter
    pub fn new() -> Self {
        Self {
            base_lr: 0.01,
            current_lr: 0.01,
            adaptation_params: Array1::from(vec![0.9, 0.1, 0.001]),
            performance_window: VecDeque::with_capacity(10),
            lr_history: Vec::new(),
        }
    }

    /// Update learning rate
    pub fn update(&mut self, lr_factor: f64) -> OptimizeResult<()> {
        self.current_lr = self.base_lr * lr_factor;
        self.lr_history.push(self.current_lr);

        Ok(())
    }

    /// Get adaptation efficiency
    pub fn get_efficiency(&self) -> f64 {
        if self.lr_history.len() < 2 {
            return 0.5;
        }

        // Simple efficiency metric based on LR stability
        let recent_changes: Vec<f64> = self
            .lr_history
            .windows(2)
            .map(|w| (w[1] - w[0]).abs())
            .collect();

        let avg_change = recent_changes.iter().sum::<f64>() / recent_changes.len() as f64;
        (1.0 / (1.0 + avg_change)).min(1.0)
    }
}

impl GradientScaler {
    /// Create new gradient scaler
    pub fn new(model_dim: usize) -> Self {
        Self {
            scale_factors: Array1::ones(model_dim),
            gradient_stats: GradientStatistics {
                mean: Array1::zeros(model_dim),
                variance: Array1::ones(model_dim),
                count: 0,
                momentum: 0.9,
            },
            scaling_params: Array1::from_elem(model_dim, 1.0),
        }
    }
}

impl StepSizePredictor {
    /// Create new step size predictor
    pub fn new(feature_dim: usize) -> Self {
        Self {
            predictor_network: Array2::from_shape_fn((1, feature_dim), |_| {
                (scirs2_core::random::rng().random::<f64>() - 0.5) * 0.1
            }),
            feature_dim,
            prediction_history: Vec::new(),
            actual_steps: Vec::new(),
        }
    }
}

impl Default for ConvergenceDetector {
    fn default() -> Self {
        Self::new()
    }
}

impl ConvergenceDetector {
    /// Create new convergence detector
    pub fn new() -> Self {
        Self {
            threshold: 1e-6,
            window_size: 10,
            recent_improvements: VecDeque::with_capacity(10),
            convergence_prob: 0.0,
        }
    }

    /// Update convergence detection
    pub fn update(&mut self, confidence: f64) -> OptimizeResult<()> {
        self.convergence_prob = 0.9 * self.convergence_prob + 0.1 * confidence;
        Ok(())
    }

    /// Get detection accuracy
    pub fn get_accuracy(&self) -> f64 {
        self.convergence_prob
    }
}

impl Default for TransformerMetrics {
    fn default() -> Self {
        Self {
            attention_entropy: 0.0,
            lr_adaptation_efficiency: 0.5,
            gradient_prediction_accuracy: 0.5,
            step_size_prediction_accuracy: 0.5,
            convergence_detection_accuracy: 0.5,
        }
    }
}

impl LearnedOptimizer for AdaptiveTransformerOptimizer {
    fn meta_train(&mut self, training_tasks: &[TrainingTask]) -> OptimizeResult<()> {
        for task in training_tasks {
            self.adapt_to_problem_class(&task.problem.problem_class)?;

            // Simulate optimization on task
            let initial_params = match &task.initial_distribution {
                super::ParameterDistribution::Uniform { low, high } => {
                    Array1::from_shape_fn(task.problem.dimension, |_| {
                        low + scirs2_core::random::rng().random::<f64>() * (high - low)
                    })
                }
                super::ParameterDistribution::Normal { mean, std } => {
                    Array1::from_shape_fn(task.problem.dimension, |_| {
                        mean + std * (scirs2_core::random::rng().random::<f64>() - 0.5) * 2.0
                    })
                }
                super::ParameterDistribution::Custom { samples } => {
                    if !samples.is_empty() {
                        samples[scirs2_core::random::rng().random_range(0..samples.len())].clone()
                    } else {
                        Array1::zeros(task.problem.dimension)
                    }
                }
            };

            // Simple training objective for meta-learning
            let training_objective = |x: &ArrayView1<f64>| x.iter().map(|&xi| xi * xi).sum::<f64>();

            // Process a few optimization steps (the returned step is discarded;
            // this warm-up only exercises the transformer and updates metrics)
            for _ in 0..10 {
                let _step = self.process_optimization_step(
                    &training_objective,
                    &initial_params.view(),
                    &task.problem,
                )?;
                self.update_performance_metrics();
            }
        }

        Ok(())
    }

    fn adapt_to_problem(
        &mut self,
        problem: &OptimizationProblem,
        initial_params: &ArrayView1<f64>,
    ) -> OptimizeResult<()> {
        self.adapt_to_problem_class(&problem.problem_class)
    }

    fn optimize<F>(
        &mut self,
        objective: F,
        initial_params: &ArrayView1<f64>,
    ) -> OptimizeResult<OptimizeResults<f64>>
    where
        F: Fn(&ArrayView1<f64>) -> f64,
    {
        let mut current_params = initial_params.to_owned();
        let mut best_value = objective(initial_params);
        let mut iterations = 0;

        // Create default problem for encoding
        let default_problem = OptimizationProblem {
            name: "unknown".to_string(),
            dimension: initial_params.len(),
            problem_class: "general".to_string(),
            metadata: HashMap::new(),
            max_evaluations: 1000,
            target_accuracy: 1e-6,
        };

        for iter in 0..1000 {
            iterations = iter;

            // Get optimization step from transformer
            let step = self.process_optimization_step(
                &objective,
                &current_params.view(),
                &default_problem,
            )?;

            // Apply optimization step
            for i in 0..current_params.len().min(step.direction.len()) {
                current_params[i] -= step.step_size * step.direction[i];
            }

            let current_value = objective(&current_params.view());

            if current_value < best_value {
                best_value = current_value;
            }

            // Check convergence
            if step.convergence_confidence > 0.95 || step.step_size < 1e-8 {
                break;
            }
        }

        Ok(OptimizeResults::<f64> {
            x: current_params,
            fun: best_value,
            success: true,
            nit: iterations,
            message: "Transformer optimization completed".to_string(),
            ..OptimizeResults::default()
        })
    }

    fn get_state(&self) -> &MetaOptimizerState {
        &self.meta_state
    }

    fn reset(&mut self) {
        self.history_buffer = OptimizationHistory::new(100);
        self.performance_metrics = TransformerMetrics::default();
        self.meta_state.episode = 0;
    }
}

/// Compute attention entropy for analysis
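/// Averages the Shannon entropy `H(p) = -Σ pᵢ ln pᵢ` of each attention row;
/// higher values indicate more diffuse attention.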
#[allow(dead_code)]
fn compute_attention_entropy(attention_scores: &Array2<f64>) -> f64 {
    let mut total_entropy = 0.0;
    let num_heads = attention_scores.nrows();

    for i in 0..num_heads {
        let row = attention_scores.row(i);
        let entropy = -row
            .iter()
            .filter(|&&p| p > 1e-8)
            .map(|&p| p * p.ln())
            .sum::<f64>();
        total_entropy += entropy;
    }

    total_entropy / num_heads as f64
}

/// Convenience function for transformer-based optimization
#[allow(dead_code)]
pub fn transformer_optimize<F>(
    objective: F,
    initial_params: &ArrayView1<f64>,
    config: Option<LearnedOptimizationConfig>,
) -> super::OptimizeResult<OptimizeResults<f64>>
where
    F: Fn(&ArrayView1<f64>) -> f64,
{
    let config = config.unwrap_or_default();
    let mut optimizer = AdaptiveTransformerOptimizer::new(config);
    optimizer.optimize(objective, initial_params)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_transformer_optimizer_creation() {
        let config = LearnedOptimizationConfig::default();
        let optimizer = AdaptiveTransformerOptimizer::new(config);

        assert_eq!(optimizer.transformer.num_layers, 6);
        assert!(!optimizer.transformer.transformer_blocks.is_empty());
    }

    #[test]
    fn test_optimization_transformer() {
        let transformer = OptimizationTransformer::new(4, 64, 100, 2);

        assert_eq!(transformer.num_layers, 2);
        assert_eq!(transformer.model_dim, 64);
        assert_eq!(transformer.transformer_blocks.len(), 2);
    }

    #[test]
    fn test_multi_head_attention() {
        let attention = MultiHeadAttention::new(4, 64);

        assert_eq!(attention.num_heads, 4);
        assert_eq!(attention.head_dim, 16);
    }

    #[test]
    fn test_transformer_forward_pass() {
        let mut transformer = OptimizationTransformer::new(2, 32, 10, 1);
        let input = Array2::from_shape_fn((5, 32), |_| scirs2_core::random::rng().random::<f64>());

        let output = transformer.forward(&input.view()).unwrap();

        assert_eq!(output.nrows(), 5);
        assert_eq!(output.ncols(), 32);
    }

    #[test]
    fn test_problem_encoder() {
        let encoder = TransformerProblemEncoder::new(64);
        let params = Array1::from(vec![1.0, 2.0]);
        let objective = |x: &ArrayView1<f64>| x[0].powi(2) + x[1].powi(2);

        let problem = OptimizationProblem {
            name: "test".to_string(),
            dimension: 2,
            problem_class: "quadratic".to_string(),
            metadata: HashMap::new(),
            max_evaluations: 1000,
            target_accuracy: 1e-6,
        };

        let encoding = encoder
            .encode_current_state(&objective, &params.view(), &problem)
            .unwrap();

        assert_eq!(encoding.len(), 64);
        assert!(encoding.iter().all(|&x| x.is_finite()));
    }
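
    // A minimal sanity check (assuming the default gamma = 1, beta = 0):
    // layer normalization should produce near-zero mean in every row.
    #[test]
    fn test_layer_normalization_forward() {
        let norm = LayerNormalization::new(8);
        let input = Array2::from_shape_fn((3, 8), |(i, j)| (i * 8 + j) as f64);
        let output = norm.forward(&input.view()).unwrap();

        for i in 0..output.nrows() {
            let row = output.row(i);
            let mean = row.iter().sum::<f64>() / row.len() as f64;
            assert!(mean.abs() < 1e-6);
        }
    }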

    #[test]
    #[ignore = "timeout"]
    fn test_transformer_optimization() {
        let objective = |x: &ArrayView1<f64>| x[0].powi(2) + x[1].powi(2);
        let initial = Array1::from(vec![2.0, 2.0]);

        let config = LearnedOptimizationConfig {
            meta_training_episodes: 5,
            hidden_size: 32,
            num_heads: 2,
            ..Default::default()
        };

        let result = transformer_optimize(objective, &initial.view(), Some(config)).unwrap();

        assert!(result.fun >= 0.0);
        assert_eq!(result.x.len(), 2);
        assert!(result.success);
    }
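
    // Sanity check for the entropy helper: a uniform attention row over n
    // entries has Shannon entropy ln(n), so a 1x4 uniform matrix yields ln(4).
    #[test]
    fn test_attention_entropy_uniform() {
        let scores = Array2::from_elem((1, 4), 0.25);
        let entropy = compute_attention_entropy(&scores);
        assert!((entropy - 4.0_f64.ln()).abs() < 1e-10);
    }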
}

#[allow(dead_code)]
pub fn placeholder() {
    // Placeholder function to prevent unused module warnings
}