1use super::{
8 ActivationType, LearnedOptimizationConfig, LearnedOptimizer, MetaOptimizerState,
9 OptimizationProblem, TrainingTask,
10};
11use crate::error::{OptimizeError, OptimizeResult};
12use crate::result::OptimizeResults;
13use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
14use scirs2_core::random::Rng;
15use statrs::statistics::Statistics;
16use std::collections::{HashMap, VecDeque};
17
/// Transformer-based learned optimizer that adapts its update rule online
/// from the optimization history of the problem being solved.
#[derive(Debug, Clone)]
pub struct AdaptiveTransformerOptimizer {
    /// Meta-learning configuration (model width, head count, learning rates).
    config: LearnedOptimizationConfig,
    /// Transformer that maps an encoded optimization state to an update.
    transformer: OptimizationTransformer,
    /// Encodes problem / parameter / gradient signals into embeddings.
    problem_encoder: TransformerProblemEncoder,
    /// Rolling buffer of past steps used to build the input sequence.
    history_buffer: OptimizationHistory,
    /// Meta-optimizer state shared with the surrounding framework.
    meta_state: MetaOptimizerState,
    /// Online adaptation helpers (attention focus, LR, convergence, ...).
    adaptive_components: AdaptiveComponents,
    /// Diagnostics gathered while optimizing.
    performance_metrics: TransformerMetrics,
}
36
/// Minimal encoder-style transformer used to produce optimization updates.
#[derive(Debug, Clone)]
pub struct OptimizationTransformer {
    /// Number of stacked transformer blocks.
    num_layers: usize,
    /// The stacked blocks, applied in order during `forward`.
    transformer_blocks: Vec<TransformerBlock>,
    /// Precomputed sinusoidal positional encodings (max_seq_len x model_dim).
    position_encoding: Array2<f64>,
    /// Input embedding matrix (model_dim x model_dim).
    input_embedding: Array2<f64>,
    /// Output projection matrix (model_dim x model_dim).
    output_projection: Array2<f64>,
    /// Width of the hidden representation.
    model_dim: usize,
}
53
/// One transformer block: self-attention and feed-forward sub-layers, each
/// followed by a residual connection and layer normalization (post-norm).
#[derive(Debug, Clone)]
pub struct TransformerBlock {
    /// Multi-head self-attention sub-layer.
    attention: MultiHeadAttention,
    /// Position-wise feed-forward sub-layer.
    feed_forward: FeedForwardNetwork,
    /// Normalization applied after the attention residual.
    layer_norm1: LayerNormalization,
    /// Normalization applied after the feed-forward residual.
    layer_norm2: LayerNormalization,
    /// Dropout probability (not currently applied in `forward`).
    dropout_rate: f64,
}
68
/// Scaled dot-product multi-head attention with learned projections.
#[derive(Debug, Clone)]
pub struct MultiHeadAttention {
    /// Number of attention heads; must evenly divide the model dimension.
    num_heads: usize,
    /// Per-head dimensionality (model_dim / num_heads).
    head_dim: usize,
    /// Query projection (model_dim x model_dim).
    w_query: Array2<f64>,
    /// Key projection (model_dim x model_dim).
    w_key: Array2<f64>,
    /// Value projection (model_dim x model_dim).
    w_value: Array2<f64>,
    /// Output projection applied after head concatenation.
    w_output: Array2<f64>,
    /// Most recent softmax attention maps (capped at 10 entries).
    attention_scores: Vec<Array2<f64>>,
}
87
/// Two-layer position-wise feed-forward network with a non-linearity.
#[derive(Debug, Clone)]
pub struct FeedForwardNetwork {
    /// First linear layer weights (hidden_dim x input_dim).
    linear1: Array2<f64>,
    /// Second linear layer weights (input_dim x hidden_dim).
    linear2: Array2<f64>,
    /// Bias for the hidden layer.
    bias1: Array1<f64>,
    /// Bias for the output layer.
    bias2: Array1<f64>,
    /// Non-linearity applied after the first layer.
    activation: ActivationType,
    /// Width of the hidden layer.
    hidden_dim: usize,
}
104
/// Per-row layer normalization with learnable scale and shift.
#[derive(Debug, Clone)]
pub struct LayerNormalization {
    /// Learnable scale, initialized to ones.
    gamma: Array1<f64>,
    /// Learnable shift, initialized to zeros.
    beta: Array1<f64>,
    /// Small constant added to the variance for numerical stability.
    epsilon: f64,
}
115
/// Projects raw problem signals (parameters, gradients, context, ...) into
/// fixed-size embeddings consumable by the transformer.
#[derive(Debug, Clone)]
pub struct TransformerProblemEncoder {
    /// Projects gradient features into the embedding space.
    gradient_encoder: Array2<f64>,
    /// Projects curvature features (not used by `encode_current_state`).
    hessian_encoder: Array2<f64>,
    /// Projects parameter statistics.
    parameter_encoder: Array2<f64>,
    /// Projects temporal features (not used by `encode_current_state`).
    temporal_encoder: Array2<f64>,
    /// Projects problem-context features.
    context_encoder: Array2<f64>,
    /// Output embedding dimensionality.
    embedding_dim: usize,
}
132
/// Bounded rolling record of an optimization run.
#[derive(Debug, Clone)]
pub struct OptimizationHistory {
    /// Parameter vectors, oldest first.
    parameter_history: VecDeque<Array1<f64>>,
    /// Objective values matching `parameter_history`.
    objective_history: VecDeque<f64>,
    /// Step directions recorded as gradient surrogates.
    gradient_history: VecDeque<Array1<f64>>,
    /// Step sizes taken at each step.
    step_size_history: VecDeque<f64>,
    /// Whether each step was judged successful (confidence > 0.5).
    success_history: VecDeque<bool>,
    /// Maximum number of retained entries.
    max_length: usize,
    /// Total steps recorded since creation (not capped by `max_length`).
    current_step: usize,
}
151
/// Bundle of online adaptation helpers used alongside the transformer.
#[derive(Debug, Clone)]
pub struct AdaptiveComponents {
    /// Tracks and adapts the attention focus distribution.
    attention_adaptation: AttentionAdaptation,
    /// Adapts the learning rate from transformer predictions.
    learning_rate_adapter: LearningRateAdapter,
    /// Running statistics for gradient scaling.
    gradient_scaler: GradientScaler,
    /// Predicts step sizes from feature vectors.
    step_size_predictor: StepSizePredictor,
    /// Estimates the probability that optimization has converged.
    convergence_detector: ConvergenceDetector,
}
166
/// Exponential-moving-average tracker of where attention concentrates.
#[derive(Debug, Clone)]
pub struct AttentionAdaptation {
    /// EMA blending rate for newly observed focus distributions.
    adaptation_rate: f64,
    /// Current focus distribution over model dimensions.
    attention_focus: Array1<f64>,
    /// Recent focus snapshots (capped at 100).
    focus_history: VecDeque<Array1<f64>>,
    /// Named focus patterns keyed by problem class.
    problem_patterns: HashMap<String, Array1<f64>>,
}
179
/// Adjusts the effective learning rate from predicted adaptation factors.
#[derive(Debug, Clone)]
pub struct LearningRateAdapter {
    /// Baseline learning rate the predicted factor is applied to.
    base_lr: f64,
    /// Most recently computed learning rate.
    current_lr: f64,
    /// Adaptation hyperparameters (not read by `update`).
    adaptation_params: Array1<f64>,
    /// Recent performance samples (not read by `update`).
    performance_window: VecDeque<f64>,
    /// Full trace of learning rates, used to score efficiency.
    lr_history: Vec<f64>,
}
194
/// Per-dimension gradient scaling state.
#[derive(Debug, Clone)]
pub struct GradientScaler {
    /// Multiplicative scale applied per dimension.
    scale_factors: Array1<f64>,
    /// Running gradient statistics feeding the scaling rule.
    gradient_stats: GradientStatistics,
    /// Parameters controlling the scaling rule.
    scaling_params: Array1<f64>,
}
205
/// Running first/second-moment estimates of observed gradients.
#[derive(Debug, Clone)]
pub struct GradientStatistics {
    /// Running mean per dimension.
    mean: Array1<f64>,
    /// Running variance per dimension.
    variance: Array1<f64>,
    /// Number of samples folded in so far.
    count: usize,
    /// Momentum coefficient for the running updates.
    momentum: f64,
}
218
/// Linear model that predicts step sizes from feature vectors.
#[derive(Debug, Clone)]
pub struct StepSizePredictor {
    /// Predictor weights (1 x feature_dim).
    predictor_network: Array2<f64>,
    /// Expected input feature dimensionality.
    feature_dim: usize,
    /// Past predictions, retained for accuracy tracking.
    prediction_history: Vec<f64>,
    /// Observed step sizes matching the predictions.
    actual_steps: Vec<f64>,
}
231
/// Smoothed estimator of convergence probability.
#[derive(Debug, Clone)]
pub struct ConvergenceDetector {
    /// Improvement threshold below which progress counts as stalled.
    threshold: f64,
    /// Number of recent improvements considered.
    window_size: usize,
    /// Recent improvement magnitudes (capacity `window_size`).
    recent_improvements: VecDeque<f64>,
    /// Exponentially smoothed convergence probability.
    convergence_prob: f64,
}
244
/// Diagnostic metrics describing optimizer behavior.
#[derive(Debug, Clone)]
pub struct TransformerMetrics {
    /// Entropy of the latest attention distribution.
    attention_entropy: f64,
    /// Stability score of learning-rate adaptation in [0, 1].
    lr_adaptation_efficiency: f64,
    /// Gradient prediction accuracy (not refreshed by
    /// `update_performance_metrics` in the visible code).
    gradient_prediction_accuracy: f64,
    /// Step-size prediction accuracy (likewise not refreshed here).
    step_size_prediction_accuracy: f64,
    /// Smoothed convergence-detection score.
    convergence_detection_accuracy: f64,
}
259
260impl AdaptiveTransformerOptimizer {
261 pub fn new(config: LearnedOptimizationConfig) -> Self {
263 let model_dim = config.hidden_size;
264 let transformer = OptimizationTransformer::new(
265 config.num_heads,
266 model_dim,
267 config.max_parameters,
268 6, );
270
271 let problem_encoder = TransformerProblemEncoder::new(model_dim);
272 let history_buffer = OptimizationHistory::new(100);
273
274 Self {
275 config,
276 transformer,
277 problem_encoder,
278 history_buffer,
279 meta_state: MetaOptimizerState {
280 meta_params: Array1::zeros(model_dim),
281 network_weights: Array2::zeros((model_dim, model_dim)),
282 performance_history: Vec::new(),
283 adaptation_stats: super::AdaptationStatistics::default(),
284 episode: 0,
285 },
286 adaptive_components: AdaptiveComponents::new(model_dim),
287 performance_metrics: TransformerMetrics::default(),
288 }
289 }
290
291 pub fn process_optimization_step<F>(
293 &mut self,
294 objective: &F,
295 current_params: &ArrayView1<f64>,
296 problem: &OptimizationProblem,
297 ) -> OptimizeResult<OptimizationStep>
298 where
299 F: Fn(&ArrayView1<f64>) -> f64,
300 {
301 let state_encoding = self.encode_optimization_state(objective, current_params, problem)?;
303
304 let transformer_output = self.transformer.forward(&state_encoding.view())?;
306
307 let optimization_step = self.decode_optimization_step(&transformer_output.view())?;
309
310 self.update_adaptive_components(&optimization_step)?;
312
313 self.history_buffer.add_step(
315 current_params.to_owned(),
316 objective(current_params),
317 optimization_step.clone(),
318 );
319
320 Ok(optimization_step)
321 }
322
323 fn encode_optimization_state<F>(
325 &self,
326 objective: &F,
327 current_params: &ArrayView1<f64>,
328 problem: &OptimizationProblem,
329 ) -> OptimizeResult<Array2<f64>>
330 where
331 F: Fn(&ArrayView1<f64>) -> f64,
332 {
333 let seq_len = self.history_buffer.current_step.min(50) + 1; let model_dim = self.transformer.model_dim;
335 let mut sequence = Array2::zeros((seq_len, model_dim));
336
337 for i in 0..seq_len - 1 {
339 if let Some(historical_encoding) = self.encode_historical_state(i) {
340 for j in 0..model_dim.min(historical_encoding.len()) {
341 sequence[[i, j]] = historical_encoding[j];
342 }
343 }
344 }
345
346 let current_encoding =
348 self.problem_encoder
349 .encode_current_state(objective, current_params, problem)?;
350
351 let last_idx = seq_len - 1;
352 for j in 0..model_dim.min(current_encoding.len()) {
353 sequence[[last_idx, j]] = current_encoding[j];
354 }
355
356 Ok(sequence)
357 }
358
359 fn encode_historical_state(&self, history_index: usize) -> Option<Array1<f64>> {
361 if history_index >= self.history_buffer.parameter_history.len() {
362 return None;
363 }
364
365 let params = &self.history_buffer.parameter_history[history_index];
366 let obj_val = self.history_buffer.objective_history[history_index];
367
368 let mut encoding = Array1::zeros(self.transformer.model_dim);
370
371 for (i, ¶m) in params.iter().enumerate() {
373 if i < encoding.len() / 4 {
374 encoding[i] = param.tanh();
375 }
376 }
377
378 let obj_idx = encoding.len() / 4;
380 if obj_idx < encoding.len() {
381 encoding[obj_idx] = obj_val.ln().abs().tanh();
382 }
383
384 if let Some(gradient) = self.history_buffer.gradient_history.get(history_index) {
386 let grad_start = encoding.len() / 2;
387 for (i, &grad) in gradient.iter().enumerate() {
388 if grad_start + i < encoding.len() {
389 encoding[grad_start + i] = grad.tanh();
390 }
391 }
392 }
393
394 Some(encoding)
395 }
396
397 fn decode_optimization_step(
399 &self,
400 transformer_output: &ArrayView2<f64>,
401 ) -> OptimizeResult<OptimizationStep> {
402 if transformer_output.is_empty() {
403 return Err(OptimizeError::InvalidInput(
404 "Empty transformer _output".to_string(),
405 ));
406 }
407
408 let last_output = transformer_output.row(transformer_output.nrows() - 1);
410
411 let step_size_raw = last_output.get(0).copied().unwrap_or(0.0);
413 let step_size = (step_size_raw.tanh() + 1.0) * 0.01; let direction_dim = self.meta_state.meta_params.len().min(last_output.len() - 1);
417 let mut direction = Array1::zeros(direction_dim);
418 for i in 0..direction_dim {
419 direction[i] = last_output.get(i + 1).copied().unwrap_or(0.0).tanh();
420 }
421
422 let lr_factor_raw = last_output
424 .get(last_output.len() / 2)
425 .copied()
426 .unwrap_or(0.0);
427 let lr_adaptation_factor = (lr_factor_raw.tanh() + 1.0) * 0.5 + 0.5; let conv_raw = last_output
431 .get(last_output.len() - 1)
432 .copied()
433 .unwrap_or(0.0);
434 let convergence_confidence = (conv_raw.tanh() + 1.0) * 0.5; Ok(OptimizationStep {
437 step_size,
438 direction,
439 lr_adaptation_factor,
440 convergence_confidence,
441 attention_weights: self.get_attention_weights(),
442 })
443 }
444
445 fn get_attention_weights(&self) -> Array2<f64> {
447 if let Some(first_block) = self.transformer.transformer_blocks.first() {
448 if let Some(last_attention) = first_block.attention.attention_scores.last() {
449 return last_attention.clone();
450 }
451 }
452 Array2::zeros((1, 1))
453 }
454
455 fn update_adaptive_components(&mut self, step: &OptimizationStep) -> OptimizeResult<()> {
457 self.adaptive_components
459 .attention_adaptation
460 .update(&step.attention_weights)?;
461
462 self.adaptive_components
464 .learning_rate_adapter
465 .update(step.lr_adaptation_factor)?;
466
467 self.adaptive_components
469 .convergence_detector
470 .update(step.convergence_confidence)?;
471
472 Ok(())
473 }
474
475 pub fn adapt_to_problem_class(&mut self, problem_class: &str) -> OptimizeResult<()> {
477 match problem_class {
479 "quadratic" => {
480 self.adaptive_components
482 .attention_adaptation
483 .set_focus_pattern(
484 Array1::from(vec![0.1, 0.2, 0.7]), );
486 }
487 "neural_network" => {
488 self.adaptive_components
490 .attention_adaptation
491 .set_focus_pattern(
492 Array1::from(vec![0.3, 0.4, 0.3]), );
494 }
495 "sparse" => {
496 self.adaptive_components
498 .attention_adaptation
499 .set_focus_pattern(
500 Array1::from(vec![0.5, 0.3, 0.2]), );
502 }
503 _ => {
504 self.adaptive_components
506 .attention_adaptation
507 .set_focus_pattern(Array1::from(vec![0.3, 0.4, 0.3]));
508 }
509 }
510
511 Ok(())
512 }
513
514 pub fn fine_tune_on_trajectories(
516 &mut self,
517 trajectories: &[OptimizationTrajectory],
518 ) -> OptimizeResult<()> {
519 for trajectory in trajectories {
520 for step in &trajectory.steps {
522 if step.improvement > 0.0 {
524 self.update_transformer_weights(&step.state_encoding, &step.action_encoding)?;
525 }
526 }
527 }
528
529 Ok(())
530 }
531
532 fn update_transformer_weights(
533 &mut self,
534 state_encoding: &Array2<f64>,
535 action_encoding: &Array1<f64>,
536 ) -> OptimizeResult<()> {
537 let learning_rate = self.config.meta_learning_rate;
539
540 for i in 0..self
542 .transformer
543 .output_projection
544 .nrows()
545 .min(action_encoding.len())
546 {
547 for j in 0..self.transformer.output_projection.ncols() {
548 if let Some(&state_val) = state_encoding.get((state_encoding.nrows() - 1, j)) {
549 self.transformer.output_projection[[i, j]] +=
550 learning_rate * action_encoding[i] * state_val;
551 }
552 }
553 }
554
555 Ok(())
556 }
557
558 pub fn get_performance_metrics(&self) -> &TransformerMetrics {
560 &self.performance_metrics
561 }
562
563 fn update_performance_metrics(&mut self) {
565 if let Some(attention_scores) = self.get_latest_attention_scores() {
567 self.performance_metrics.attention_entropy =
568 compute_attention_entropy(&attention_scores);
569 }
570
571 self.performance_metrics.lr_adaptation_efficiency = self
573 .adaptive_components
574 .learning_rate_adapter
575 .get_efficiency();
576
577 self.performance_metrics.convergence_detection_accuracy =
578 self.adaptive_components.convergence_detector.get_accuracy();
579 }
580
581 fn get_latest_attention_scores(&self) -> Option<Array2<f64>> {
582 self.transformer
583 .transformer_blocks
584 .first()?
585 .attention
586 .attention_scores
587 .last()
588 .cloned()
589 }
590}
591
/// A single update proposal decoded from the transformer output.
#[derive(Debug, Clone)]
pub struct OptimizationStep {
    /// Scalar step length, squashed into (0, 0.02).
    pub step_size: f64,
    /// Update direction with each component in [-1, 1].
    pub direction: Array1<f64>,
    /// Learning-rate multiplier in (0.5, 1.5).
    pub lr_adaptation_factor: f64,
    /// Estimated convergence probability in (0, 1).
    pub convergence_confidence: f64,
    /// Attention map snapshot associated with this step.
    pub attention_weights: Array2<f64>,
}
606
/// A recorded optimization run used for fine-tuning.
#[derive(Debug, Clone)]
pub struct OptimizationTrajectory {
    /// Ordered steps of the trajectory.
    pub steps: Vec<TrajectoryStep>,
    /// Objective value reached at the end of the run.
    pub final_objective: f64,
    /// Whether the run was deemed successful.
    pub success: bool,
}
617
/// One state/action pair within a recorded trajectory.
#[derive(Debug, Clone)]
pub struct TrajectoryStep {
    /// Encoded optimization state at this step.
    pub state_encoding: Array2<f64>,
    /// Encoded action taken by the optimizer.
    pub action_encoding: Array1<f64>,
    /// Objective improvement achieved by the action.
    pub improvement: f64,
    /// Index of the step within the trajectory.
    pub step_number: usize,
}
630
631impl OptimizationTransformer {
632 pub fn new(num_heads: usize, model_dim: usize, max_seq_len: usize, num_layers: usize) -> Self {
634 let mut transformer_blocks = Vec::new();
635
636 for _ in 0..num_layers {
637 transformer_blocks.push(TransformerBlock::new(num_heads, model_dim));
638 }
639
640 let position_encoding = Self::create_position_encoding(max_seq_len, model_dim);
642
643 let input_embedding = Array2::from_shape_fn((model_dim, model_dim), |_| {
645 (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / model_dim as f64).sqrt()
646 });
647
648 let output_projection = Array2::from_shape_fn((model_dim, model_dim), |_| {
650 (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / model_dim as f64).sqrt()
651 });
652
653 Self {
654 num_layers,
655 transformer_blocks,
656 position_encoding,
657 input_embedding,
658 output_projection,
659 model_dim,
660 }
661 }
662
663 pub fn forward(&mut self, input_sequence: &ArrayView2<f64>) -> OptimizeResult<Array2<f64>> {
665 let seq_len = input_sequence.nrows();
666 let input_dim = input_sequence.ncols();
667
668 let mut embedded = Array2::zeros((seq_len, self.model_dim));
670 for i in 0..seq_len {
671 for j in 0..self.model_dim {
672 for k in 0..input_dim.min(self.input_embedding.ncols()) {
673 embedded[[i, j]] += self.input_embedding[[j, k]] * input_sequence[[i, k]];
674 }
675 }
676 }
677
678 for i in 0..seq_len.min(self.position_encoding.nrows()) {
680 for j in 0..self.model_dim.min(self.position_encoding.ncols()) {
681 embedded[[i, j]] += self.position_encoding[[i, j]];
682 }
683 }
684
685 let mut current = embedded;
687 for block in &mut self.transformer_blocks {
688 current = block.forward(¤t.view())?;
689 }
690
691 let mut output = Array2::zeros((seq_len, self.model_dim));
693 for i in 0..seq_len {
694 for j in 0..self.model_dim {
695 for k in 0..self.model_dim.min(self.output_projection.ncols()) {
696 output[[i, j]] += self.output_projection[[j, k]] * current[[i, k]];
697 }
698 }
699 }
700
701 Ok(output)
702 }
703
704 fn create_position_encoding(_max_len: usize, model_dim: usize) -> Array2<f64> {
706 let mut pos_encoding = Array2::zeros((_max_len, model_dim));
707
708 for pos in 0.._max_len {
709 for i in 0..model_dim {
710 let angle = pos as f64 / 10000_f64.powf(2.0 * i as f64 / model_dim as f64);
711 if i % 2 == 0 {
712 pos_encoding[[pos, i]] = angle.sin();
713 } else {
714 pos_encoding[[pos, i]] = angle.cos();
715 }
716 }
717 }
718
719 pos_encoding
720 }
721}
722
723impl TransformerBlock {
724 pub fn new(num_heads: usize, model_dim: usize) -> Self {
726 let attention = MultiHeadAttention::new(num_heads, model_dim);
727 let feed_forward = FeedForwardNetwork::new(model_dim, model_dim * 4);
728 let layer_norm1 = LayerNormalization::new(model_dim);
729 let layer_norm2 = LayerNormalization::new(model_dim);
730
731 Self {
732 attention,
733 feed_forward,
734 layer_norm1,
735 layer_norm2,
736 dropout_rate: 0.1,
737 }
738 }
739
740 pub fn forward(&mut self, input: &ArrayView2<f64>) -> OptimizeResult<Array2<f64>> {
742 let attention_output = self.attention.forward(input, input, input)?;
744 let residual1 = input + &attention_output.view();
745 let after_attention = self.layer_norm1.forward(&residual1.view())?;
746
747 let ff_output = self.feed_forward.forward(&after_attention.view())?;
749 let residual2 = &after_attention + &ff_output.view();
750 let output = self.layer_norm2.forward(&residual2.view())?;
751
752 Ok(output)
753 }
754}
755
impl MultiHeadAttention {
    /// Creates multi-head attention with Xavier-style random projections.
    ///
    /// # Panics
    /// Panics if `model_dim` is not divisible by `num_heads`.
    pub fn new(num_heads: usize, model_dim: usize) -> Self {
        assert_eq!(model_dim % num_heads, 0);
        let head_dim = model_dim / num_heads;

        let w_query = Array2::from_shape_fn((model_dim, model_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / model_dim as f64).sqrt()
        });
        let w_key = Array2::from_shape_fn((model_dim, model_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / model_dim as f64).sqrt()
        });
        let w_value = Array2::from_shape_fn((model_dim, model_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / model_dim as f64).sqrt()
        });
        let w_output = Array2::from_shape_fn((model_dim, model_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / model_dim as f64).sqrt()
        });

        Self {
            num_heads,
            head_dim,
            w_query,
            w_key,
            w_value,
            w_output,
            attention_scores: Vec::new(),
        }
    }

    /// Full attention pass: project Q/K/V, run scaled dot-product attention
    /// per head, write each head's output into its slice, then apply the
    /// output projection.
    pub fn forward(
        &mut self,
        query: &ArrayView2<f64>,
        key: &ArrayView2<f64>,
        value: &ArrayView2<f64>,
    ) -> OptimizeResult<Array2<f64>> {
        let seq_len = query.nrows();
        let model_dim = query.ncols();

        let q = self.linear_transform(query, &self.w_query)?;
        let k = self.linear_transform(key, &self.w_key)?;
        let v = self.linear_transform(value, &self.w_value)?;

        let mut attention_output = Array2::zeros((seq_len, model_dim));

        for head in 0..self.num_heads {
            // Each head operates on its own contiguous slice of the dims.
            let head_start = head * self.head_dim;
            let head_end = head_start + self.head_dim;

            let q_head = q.slice(scirs2_core::ndarray::s![.., head_start..head_end]);
            let k_head = k.slice(scirs2_core::ndarray::s![.., head_start..head_end]);
            let v_head = v.slice(scirs2_core::ndarray::s![.., head_start..head_end]);

            let scores = self.compute_attention_scores(&q_head, &k_head)?;

            let head_output = self.apply_attention(&scores, &v_head)?;

            // Copy the head's result back into its slot of the output.
            for i in 0..seq_len {
                for j in 0..self.head_dim.min(model_dim - head_start) {
                    attention_output[[i, head_start + j]] = head_output[[i, j]];
                }
            }
        }

        let output = self.linear_transform(&attention_output.view(), &self.w_output)?;

        Ok(output)
    }

    /// Naive matrix product: `output[i][j] = sum_k weight[j][k] * input[i][k]`.
    fn linear_transform(
        &self,
        input: &ArrayView2<f64>,
        weight: &Array2<f64>,
    ) -> OptimizeResult<Array2<f64>> {
        let seq_len = input.nrows();
        let input_dim = input.ncols();
        let output_dim = weight.nrows();

        let mut output = Array2::zeros((seq_len, output_dim));

        for i in 0..seq_len {
            for j in 0..output_dim {
                for k in 0..input_dim.min(weight.ncols()) {
                    output[[i, j]] += weight[[j, k]] * input[[i, k]];
                }
            }
        }

        Ok(output)
    }

    /// Scaled dot-product scores followed by a numerically stable row-wise
    /// softmax; the resulting map is pushed onto `attention_scores`
    /// (keeping at most the 10 most recent maps).
    fn compute_attention_scores(
        &mut self,
        query: &ArrayView2<f64>,
        key: &ArrayView2<f64>,
    ) -> OptimizeResult<Array2<f64>> {
        let seq_len = query.nrows();
        let head_dim = query.ncols();

        let mut scores = Array2::zeros((seq_len, seq_len));
        // Standard 1/sqrt(d_k) attention scaling.
        let scale = 1.0 / (head_dim as f64).sqrt();

        for i in 0..seq_len {
            for j in 0..seq_len {
                let mut dot_product = 0.0;
                for k in 0..head_dim {
                    dot_product += query[[i, k]] * key[[j, k]];
                }
                scores[[i, j]] = dot_product * scale;
            }
        }

        // Row-wise softmax; subtracting the row max avoids exp overflow.
        for i in 0..seq_len {
            let mut row_sum = 0.0;
            let max_val = scores.row(i).fold(-f64::INFINITY, |a, &b| a.max(b));

            for j in 0..seq_len {
                scores[[i, j]] = (scores[[i, j]] - max_val).exp();
                row_sum += scores[[i, j]];
            }

            if row_sum > 0.0 {
                for j in 0..seq_len {
                    scores[[i, j]] /= row_sum;
                }
            }
        }

        self.attention_scores.push(scores.clone());
        if self.attention_scores.len() > 10 {
            self.attention_scores.remove(0);
        }

        Ok(scores)
    }

    /// Weighted sum of value rows: `output = scores * values`.
    fn apply_attention(
        &self,
        scores: &Array2<f64>,
        values: &ArrayView2<f64>,
    ) -> OptimizeResult<Array2<f64>> {
        let seq_len = scores.nrows();
        let head_dim = values.ncols();

        let mut output = Array2::zeros((seq_len, head_dim));

        for i in 0..seq_len {
            for j in 0..head_dim {
                for k in 0..seq_len {
                    output[[i, j]] += scores[[i, k]] * values[[k, j]];
                }
            }
        }

        Ok(output)
    }
}
923
impl FeedForwardNetwork {
    /// Creates a two-layer network with Xavier-style random weights, zero
    /// biases, and GELU activation.
    pub fn new(input_dim: usize, hidden_dim: usize) -> Self {
        let linear1 = Array2::from_shape_fn((hidden_dim, input_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / input_dim as f64).sqrt()
        });
        let linear2 = Array2::from_shape_fn((input_dim, hidden_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / hidden_dim as f64).sqrt()
        });

        Self {
            linear1,
            linear2,
            bias1: Array1::zeros(hidden_dim),
            bias2: Array1::zeros(input_dim),
            activation: ActivationType::GELU,
            hidden_dim,
        }
    }

    /// Applies `linear2(activation(linear1(x) + bias1)) + bias2` row by row.
    pub fn forward(&self, input: &ArrayView2<f64>) -> OptimizeResult<Array2<f64>> {
        let seq_len = input.nrows();
        let input_dim = input.ncols();

        // Hidden layer: linear transform, bias, then the non-linearity.
        let mut hidden = Array2::zeros((seq_len, self.hidden_dim));
        for i in 0..seq_len {
            for j in 0..self.hidden_dim {
                for k in 0..input_dim.min(self.linear1.ncols()) {
                    hidden[[i, j]] += self.linear1[[j, k]] * input[[i, k]];
                }
                hidden[[i, j]] += self.bias1[j];
                hidden[[i, j]] = self.activation.apply(hidden[[i, j]]);
            }
        }

        // Output layer: linear transform plus bias (no activation).
        let mut output = Array2::zeros((seq_len, input_dim));
        for i in 0..seq_len {
            for j in 0..input_dim {
                for k in 0..self.hidden_dim.min(self.linear2.ncols()) {
                    output[[i, j]] += self.linear2[[j, k]] * hidden[[i, k]];
                }
                output[[i, j]] += self.bias2[j];
            }
        }

        Ok(output)
    }
}
975
impl LayerNormalization {
    /// Creates an identity normalization (gamma = 1, beta = 0) with a
    /// 1e-6 stability epsilon.
    pub fn new(dim: usize) -> Self {
        Self {
            gamma: Array1::ones(dim),
            beta: Array1::zeros(dim),
            epsilon: 1e-6,
        }
    }

    /// Normalizes each row to zero mean / unit variance, then applies the
    /// learnable scale and shift.
    pub fn forward(&self, input: &ArrayView2<f64>) -> OptimizeResult<Array2<f64>> {
        let seq_len = input.nrows();
        let dim = input.ncols();
        let mut output = Array2::zeros((seq_len, dim));

        for i in 0..seq_len {
            let row = input.row(i);
            // NOTE(review): `mean()`/`variance()` presumably resolve via the
            // imported `statrs::statistics::Statistics` trait — confirm.
            let mean = row.mean();
            let var = input.row(i).variance();
            let std = (var + self.epsilon).sqrt();

            for j in 0..dim.min(self.gamma.len()) {
                output[[i, j]] = self.gamma[j] * (input[[i, j]] - mean) / std + self.beta[j];
            }
        }

        Ok(output)
    }
}
1008
1009impl TransformerProblemEncoder {
1010 pub fn new(embedding_dim: usize) -> Self {
1012 let feature_dim = 20; Self {
1015 gradient_encoder: Array2::from_shape_fn((embedding_dim, feature_dim), |_| {
1016 (scirs2_core::random::rng().random::<f64>() - 0.5) * 0.1
1017 }),
1018 hessian_encoder: Array2::from_shape_fn((embedding_dim, feature_dim), |_| {
1019 (scirs2_core::random::rng().random::<f64>() - 0.5) * 0.1
1020 }),
1021 parameter_encoder: Array2::from_shape_fn((embedding_dim, feature_dim), |_| {
1022 (scirs2_core::random::rng().random::<f64>() - 0.5) * 0.1
1023 }),
1024 temporal_encoder: Array2::from_shape_fn((embedding_dim, feature_dim), |_| {
1025 (scirs2_core::random::rng().random::<f64>() - 0.5) * 0.1
1026 }),
1027 context_encoder: Array2::from_shape_fn((embedding_dim, feature_dim), |_| {
1028 (scirs2_core::random::rng().random::<f64>() - 0.5) * 0.1
1029 }),
1030 embedding_dim,
1031 }
1032 }
1033
1034 pub fn encode_current_state<F>(
1036 &self,
1037 objective: &F,
1038 current_params: &ArrayView1<f64>,
1039 problem: &OptimizationProblem,
1040 ) -> OptimizeResult<Array1<f64>>
1041 where
1042 F: Fn(&ArrayView1<f64>) -> f64,
1043 {
1044 let mut encoding = Array1::zeros(self.embedding_dim);
1045
1046 let param_features = self.encode_parameter_features(current_params);
1048 let grad_features = self.encode_gradient_features(objective, current_params);
1049 let context_features = self.encode_context_features(problem);
1050
1051 self.combine_features(&mut encoding, ¶m_features, &self.parameter_encoder);
1053 self.combine_features(&mut encoding, &grad_features, &self.gradient_encoder);
1054 self.combine_features(&mut encoding, &context_features, &self.context_encoder);
1055
1056 Ok(encoding)
1057 }
1058
1059 fn encode_parameter_features(&self, params: &ArrayView1<f64>) -> Array1<f64> {
1060 let mut features = Array1::zeros(20);
1061
1062 if !params.is_empty() {
1063 features[0] = params.view().mean().tanh();
1064 features[1] = params.view().variance().sqrt().tanh();
1065 features[2] = params.fold(-f64::INFINITY, |a, &b| a.max(b)).tanh();
1066 features[3] = params.fold(f64::INFINITY, |a, &b| a.min(b)).tanh();
1067 features[4] = (params.len() as f64).ln().tanh();
1068
1069 features[5] =
1071 (params.iter().map(|&x| x.abs()).sum::<f64>() / params.len() as f64).tanh(); features[6] = (params.iter().map(|&x| x * x).sum::<f64>()).sqrt().tanh(); let mean = features[0];
1076 let skewness = params
1077 .iter()
1078 .map(|&x| ((x - mean) / (features[1] + 1e-8)).powi(3))
1079 .sum::<f64>()
1080 / params.len() as f64;
1081 features[7] = skewness.tanh();
1082
1083 let zero_count = params.iter().filter(|&&x| x.abs() < 1e-8).count();
1085 features[8] = (zero_count as f64 / params.len() as f64).tanh();
1086 }
1087
1088 features
1089 }
1090
1091 fn encode_gradient_features<F>(&self, objective: &F, params: &ArrayView1<f64>) -> Array1<f64>
1092 where
1093 F: Fn(&ArrayView1<f64>) -> f64,
1094 {
1095 let mut features = Array1::zeros(20);
1096
1097 let h = 1e-6;
1098 let f0 = objective(params);
1099 let mut gradient = Array1::zeros(params.len());
1100
1101 for i in 0..params.len().min(20) {
1103 let mut params_plus = params.to_owned();
1105 params_plus[i] += h;
1106 let f_plus = objective(¶ms_plus.view());
1107 gradient[i] = (f_plus - f0) / h;
1108 }
1109
1110 if !gradient.is_empty() {
1111 features[0] = (gradient.iter().map(|&g| g * g).sum::<f64>())
1112 .sqrt()
1113 .ln()
1114 .tanh(); features[1] = f0.abs().ln().tanh(); features[2] = gradient.view().mean().tanh(); features[3] = gradient.view().variance().sqrt().tanh(); let grad_consistency = gradient
1121 .iter()
1122 .zip(params.iter())
1123 .map(|(&g, &p)| if p * g < 0.0 { 1.0 } else { 0.0 })
1124 .sum::<f64>()
1125 / gradient.len() as f64;
1126 features[4] = grad_consistency.tanh();
1127 }
1128
1129 features
1130 }
1131
1132 fn encode_context_features(&self, problem: &OptimizationProblem) -> Array1<f64> {
1133 let mut features = Array1::zeros(20);
1134
1135 features[0] = (problem.dimension as f64).ln().tanh();
1136 features[1] = (problem.max_evaluations as f64).ln().tanh();
1137 features[2] = problem.target_accuracy.ln().abs().tanh();
1138
1139 match problem.problem_class.as_str() {
1141 "quadratic" => features[3] = 1.0,
1142 "neural_network" => features[4] = 1.0,
1143 "sparse" => {
1144 features[5] = 1.0;
1145 features[6] = 1.0;
1146 }
1147 _ => {} }
1149
1150 features
1151 }
1152
1153 fn combine_features(
1154 &self,
1155 encoding: &mut Array1<f64>,
1156 features: &Array1<f64>,
1157 encoder: &Array2<f64>,
1158 ) {
1159 for i in 0..encoding.len() {
1160 for j in 0..features.len().min(encoder.ncols()) {
1161 encoding[i] += encoder[[i, j]] * features[j];
1162 }
1163 }
1164 }
1165}
1166
1167impl OptimizationHistory {
1168 pub fn new(max_length: usize) -> Self {
1170 Self {
1171 parameter_history: VecDeque::with_capacity(max_length),
1172 objective_history: VecDeque::with_capacity(max_length),
1173 gradient_history: VecDeque::with_capacity(max_length),
1174 step_size_history: VecDeque::with_capacity(max_length),
1175 success_history: VecDeque::with_capacity(max_length),
1176 max_length,
1177 current_step: 0,
1178 }
1179 }
1180
1181 pub fn add_step(&mut self, params: Array1<f64>, objective: f64, step: OptimizationStep) {
1183 if self.parameter_history.len() >= self.max_length {
1184 self.parameter_history.pop_front();
1185 self.objective_history.pop_front();
1186 self.gradient_history.pop_front();
1187 self.step_size_history.pop_front();
1188 self.success_history.pop_front();
1189 }
1190
1191 self.parameter_history.push_back(params);
1192 self.objective_history.push_back(objective);
1193 self.gradient_history.push_back(step.direction);
1194 self.step_size_history.push_back(step.step_size);
1195 self.success_history
1196 .push_back(step.convergence_confidence > 0.5);
1197
1198 self.current_step += 1;
1199 }
1200}
1201
impl AdaptiveComponents {
    /// Creates all adaptation helpers sized to the transformer width.
    pub fn new(model_dim: usize) -> Self {
        Self {
            attention_adaptation: AttentionAdaptation::new(model_dim),
            learning_rate_adapter: LearningRateAdapter::new(),
            gradient_scaler: GradientScaler::new(model_dim),
            step_size_predictor: StepSizePredictor::new(model_dim),
            convergence_detector: ConvergenceDetector::new(),
        }
    }
}
1214
1215impl AttentionAdaptation {
1216 pub fn new(model_dim: usize) -> Self {
1218 Self {
1219 adaptation_rate: 0.01,
1220 attention_focus: Array1::from_elem(model_dim, 1.0 / model_dim as f64),
1221 focus_history: VecDeque::with_capacity(100),
1222 problem_patterns: HashMap::new(),
1223 }
1224 }
1225
1226 pub fn update(&mut self, attention_weights: &Array2<f64>) -> OptimizeResult<()> {
1228 if attention_weights.is_empty() {
1229 return Ok(());
1230 }
1231
1232 let mut new_focus = Array1::zeros(self.attention_focus.len());
1234 for i in 0..attention_weights.nrows().min(new_focus.len()) {
1235 new_focus[i] = attention_weights.row(i).mean();
1236 }
1237
1238 for i in 0..self.attention_focus.len() {
1240 self.attention_focus[i] = (1.0 - self.adaptation_rate) * self.attention_focus[i]
1241 + self.adaptation_rate * new_focus.get(i).copied().unwrap_or(0.0);
1242 }
1243
1244 self.focus_history.push_back(self.attention_focus.clone());
1246 if self.focus_history.len() > 100 {
1247 self.focus_history.pop_front();
1248 }
1249
1250 Ok(())
1251 }
1252
1253 pub fn set_focus_pattern(&mut self, pattern: Array1<f64>) {
1255 if pattern.len() <= self.attention_focus.len() {
1256 for (i, &val) in pattern.iter().enumerate() {
1257 self.attention_focus[i] = val;
1258 }
1259 }
1260 }
1261}
1262
/// Delegates to [`LearningRateAdapter::new`].
impl Default for LearningRateAdapter {
    fn default() -> Self {
        Self::new()
    }
}
1268
1269impl LearningRateAdapter {
1270 pub fn new() -> Self {
1272 Self {
1273 base_lr: 0.01,
1274 current_lr: 0.01,
1275 adaptation_params: Array1::from(vec![0.9, 0.1, 0.001]),
1276 performance_window: VecDeque::with_capacity(10),
1277 lr_history: Vec::new(),
1278 }
1279 }
1280
1281 pub fn update(&mut self, lr_factor: f64) -> OptimizeResult<()> {
1283 self.current_lr = self.base_lr * lr_factor;
1284 self.lr_history.push(self.current_lr);
1285
1286 Ok(())
1287 }
1288
1289 pub fn get_efficiency(&self) -> f64 {
1291 if self.lr_history.len() < 2 {
1292 return 0.5;
1293 }
1294
1295 let recent_changes: Vec<f64> = self
1297 .lr_history
1298 .windows(2)
1299 .map(|w| (w[1] - w[0]).abs())
1300 .collect();
1301
1302 let avg_change = recent_changes.iter().sum::<f64>() / recent_changes.len() as f64;
1303 (1.0 / (1.0 + avg_change)).min(1.0)
1304 }
1305}
1306
impl GradientScaler {
    /// Creates a neutral scaler: unit scale factors, zero-mean / unit-variance
    /// statistics, and 0.9 momentum.
    pub fn new(model_dim: usize) -> Self {
        Self {
            scale_factors: Array1::ones(model_dim),
            gradient_stats: GradientStatistics {
                mean: Array1::zeros(model_dim),
                variance: Array1::ones(model_dim),
                count: 0,
                momentum: 0.9,
            },
            scaling_params: Array1::from_elem(model_dim, 1.0),
        }
    }
}
1322
impl StepSizePredictor {
    /// Creates a linear predictor with small random weights (1 x feature_dim)
    /// and empty prediction/observation logs.
    pub fn new(feature_dim: usize) -> Self {
        Self {
            predictor_network: Array2::from_shape_fn((1, feature_dim), |_| {
                (scirs2_core::random::rng().random::<f64>() - 0.5) * 0.1
            }),
            feature_dim,
            prediction_history: Vec::new(),
            actual_steps: Vec::new(),
        }
    }
}
1336
/// Delegates to [`ConvergenceDetector::new`].
impl Default for ConvergenceDetector {
    fn default() -> Self {
        Self::new()
    }
}
1342
1343impl ConvergenceDetector {
1344 pub fn new() -> Self {
1346 Self {
1347 threshold: 1e-6,
1348 window_size: 10,
1349 recent_improvements: VecDeque::with_capacity(10),
1350 convergence_prob: 0.0,
1351 }
1352 }
1353
1354 pub fn update(&mut self, confidence: f64) -> OptimizeResult<()> {
1356 self.convergence_prob = 0.9 * self.convergence_prob + 0.1 * confidence;
1357 Ok(())
1358 }
1359
1360 pub fn get_accuracy(&self) -> f64 {
1362 self.convergence_prob
1363 }
1364}
1365
1366impl Default for TransformerMetrics {
1367 fn default() -> Self {
1368 Self {
1369 attention_entropy: 0.0,
1370 lr_adaptation_efficiency: 0.5,
1371 gradient_prediction_accuracy: 0.5,
1372 step_size_prediction_accuracy: 0.5,
1373 convergence_detection_accuracy: 0.5,
1374 }
1375 }
1376}
1377
1378impl LearnedOptimizer for AdaptiveTransformerOptimizer {
1379 fn meta_train(&mut self, training_tasks: &[TrainingTask]) -> OptimizeResult<()> {
1380 for task in training_tasks {
1381 self.adapt_to_problem_class(&task.problem.problem_class)?;
1382
1383 let initial_params = match &task.initial_distribution {
1385 super::ParameterDistribution::Uniform { low, high } => {
1386 Array1::from_shape_fn(task.problem.dimension, |_| {
1387 low + scirs2_core::random::rng().random::<f64>() * (high - low)
1388 })
1389 }
1390 super::ParameterDistribution::Normal { mean, std } => {
1391 Array1::from_shape_fn(task.problem.dimension, |_| {
1392 mean + std * (scirs2_core::random::rng().random::<f64>() - 0.5) * 2.0
1393 })
1394 }
1395 super::ParameterDistribution::Custom { samples } => {
1396 if !samples.is_empty() {
1397 samples[scirs2_core::random::rng().random_range(0..samples.len())].clone()
1398 } else {
1399 Array1::zeros(task.problem.dimension)
1400 }
1401 }
1402 };
1403
1404 let training_objective = |x: &ArrayView1<f64>| x.iter().map(|&xi| xi * xi).sum::<f64>();
1406
1407 for _ in 0..10 {
1409 let step = self.process_optimization_step(
1410 &training_objective,
1411 &initial_params.view(),
1412 &task.problem,
1413 )?;
1414 self.update_performance_metrics();
1415 }
1416 }
1417
1418 Ok(())
1419 }
1420
1421 fn adapt_to_problem(
1422 &mut self,
1423 problem: &OptimizationProblem,
1424 initial_params: &ArrayView1<f64>,
1425 ) -> OptimizeResult<()> {
1426 self.adapt_to_problem_class(&problem.problem_class)
1427 }
1428
1429 fn optimize<F>(
1430 &mut self,
1431 objective: F,
1432 initial_params: &ArrayView1<f64>,
1433 ) -> OptimizeResult<OptimizeResults<f64>>
1434 where
1435 F: Fn(&ArrayView1<f64>) -> f64,
1436 {
1437 let mut current_params = initial_params.to_owned();
1438 let mut best_value = objective(initial_params);
1439 let mut iterations = 0;
1440
1441 let default_problem = OptimizationProblem {
1443 name: "unknown".to_string(),
1444 dimension: initial_params.len(),
1445 problem_class: "general".to_string(),
1446 metadata: HashMap::new(),
1447 max_evaluations: 1000,
1448 target_accuracy: 1e-6,
1449 };
1450
1451 for iter in 0..1000 {
1452 iterations = iter;
1453
1454 let step = self.process_optimization_step(
1456 &objective,
1457 ¤t_params.view(),
1458 &default_problem,
1459 )?;
1460
1461 for i in 0..current_params.len().min(step.direction.len()) {
1463 current_params[i] -= step.step_size * step.direction[i];
1464 }
1465
1466 let current_value = objective(¤t_params.view());
1467
1468 if current_value < best_value {
1469 best_value = current_value;
1470 }
1471
1472 if step.convergence_confidence > 0.95 || step.step_size < 1e-8 {
1474 break;
1475 }
1476 }
1477
1478 Ok(OptimizeResults::<f64> {
1479 x: current_params,
1480 fun: best_value,
1481 success: true,
1482 nit: iterations,
1483 message: "Transformer optimization completed".to_string(),
1484 ..OptimizeResults::default()
1485 })
1486 }
1487
1488 fn get_state(&self) -> &MetaOptimizerState {
1489 &self.meta_state
1490 }
1491
1492 fn reset(&mut self) {
1493 self.history_buffer = OptimizationHistory::new(100);
1494 self.performance_metrics = TransformerMetrics::default();
1495 self.meta_state.episode = 0;
1496 }
1497}
1498
1499#[allow(dead_code)]
1501fn compute_attention_entropy(attention_scores: &Array2<f64>) -> f64 {
1502 let mut total_entropy = 0.0;
1503 let num_heads = attention_scores.nrows();
1504
1505 for i in 0..num_heads {
1506 let row = attention_scores.row(i);
1507 let entropy = -row
1508 .iter()
1509 .filter(|&&p| p > 1e-8)
1510 .map(|&p| p * p.ln())
1511 .sum::<f64>();
1512 total_entropy += entropy;
1513 }
1514
1515 total_entropy / num_heads as f64
1516}
1517
1518#[allow(dead_code)]
1520pub fn transformer_optimize<F>(
1521 objective: F,
1522 initial_params: &ArrayView1<f64>,
1523 config: Option<LearnedOptimizationConfig>,
1524) -> super::OptimizeResult<OptimizeResults<f64>>
1525where
1526 F: Fn(&ArrayView1<f64>) -> f64,
1527{
1528 let config = config.unwrap_or_default();
1529 let mut optimizer = AdaptiveTransformerOptimizer::new(config);
1530 optimizer.optimize(objective, initial_params)
1531}
1532
#[cfg(test)]
mod tests {
    use super::*;

    // Constructing from the default config yields the standard 6-layer
    // transformer with a non-empty block stack.
    #[test]
    fn test_transformer_optimizer_creation() {
        let config = LearnedOptimizationConfig::default();
        let optimizer = AdaptiveTransformerOptimizer::new(config);

        assert_eq!(optimizer.transformer.num_layers, 6);
        assert!(!optimizer.transformer.transformer_blocks.is_empty());
    }

    // Constructor arguments (heads, model dim, seq len, layers) are
    // reflected in the built transformer.
    #[test]
    fn test_optimization_transformer() {
        let transformer = OptimizationTransformer::new(4, 64, 100, 2);

        assert_eq!(transformer.num_layers, 2);
        assert_eq!(transformer.model_dim, 64);
        assert_eq!(transformer.transformer_blocks.len(), 2);
    }

    // head_dim should be model_dim / num_heads (64 / 4 = 16).
    #[test]
    fn test_multi_head_attention() {
        let attention = MultiHeadAttention::new(4, 64);

        assert_eq!(attention.num_heads, 4);
        assert_eq!(attention.head_dim, 16);
    }

    // Forward pass preserves the (seq_len, model_dim) shape of its input.
    #[test]
    fn test_transformer_forward_pass() {
        let mut transformer = OptimizationTransformer::new(2, 32, 10, 1);
        let input = Array2::from_shape_fn((5, 32), |_| scirs2_core::random::rng().random::<f64>());

        let output = transformer.forward(&input.view()).unwrap();

        assert_eq!(output.nrows(), 5);
        assert_eq!(output.ncols(), 32);
    }

    // Encoding a simple quadratic problem produces a finite vector of the
    // encoder's configured dimension.
    #[test]
    fn test_problem_encoder() {
        let encoder = TransformerProblemEncoder::new(64);
        let params = Array1::from(vec![1.0, 2.0]);
        let objective = |x: &ArrayView1<f64>| x[0].powi(2) + x[1].powi(2);

        let problem = OptimizationProblem {
            name: "test".to_string(),
            dimension: 2,
            problem_class: "quadratic".to_string(),
            metadata: HashMap::new(),
            max_evaluations: 1000,
            target_accuracy: 1e-6,
        };

        let encoding = encoder
            .encode_current_state(&objective, &params.view(), &problem)
            .unwrap();

        assert_eq!(encoding.len(), 64);
        assert!(encoding.iter().all(|&x| x.is_finite()));
    }

    // End-to-end smoke test on a 2-D quadratic. Ignored by default
    // because the full optimization loop exceeds the CI timeout.
    #[test]
    #[ignore = "timeout"]
    fn test_transformer_optimization() {
        let objective = |x: &ArrayView1<f64>| x[0].powi(2) + x[1].powi(2);
        let initial = Array1::from(vec![2.0, 2.0]);

        let config = LearnedOptimizationConfig {
            meta_training_episodes: 5,
            hidden_size: 32,
            num_heads: 2,
            ..Default::default()
        };

        let result = transformer_optimize(objective, &initial.view(), Some(config)).unwrap();

        assert!(result.fun >= 0.0);
        assert_eq!(result.x.len(), 2);
        assert!(result.success);
    }
}
1617
/// No-op kept to preserve the module's public surface.
#[allow(dead_code)]
pub fn placeholder() {}