1#[allow(dead_code)]
8use scirs2_core::ndarray::{s, Array, Array1, Array2, ArrayBase, Data, Dimension};
9use scirs2_core::numeric::Float;
10use scirs2_core::random::Rng;
11use std::collections::{HashMap, VecDeque};
12use std::fmt::Debug;
13
14use super::{LearnedOptimizerConfig, MetaOptimizationStrategy};
15use crate::error::{OptimError, Result};
16
/// LSTM-based learned optimizer: predicts parameter updates from gradient
/// history with a recurrent network instead of a hand-designed update rule.
#[derive(Debug)]
pub struct LSTMOptimizer<T: Float + Debug + Send + Sync + 'static> {
    /// Static configuration for the learned optimizer.
    config: LearnedOptimizerConfig,

    /// Recurrent network mapping gradient features to update directions.
    lstm_network: LSTMNetwork<T>,

    /// Rolling buffer of recent gradients, parameters and losses.
    history_buffer: HistoryBuffer<T>,

    /// Meta-learning component that trains the optimizer across tasks.
    meta_learner: MetaLearner<T>,

    /// Adapts the learning rate during optimization.
    lr_controller: AdaptiveLearningRateController<T>,

    /// Tracks optimization phase and convergence signals.
    state_tracker: OptimizationStateTracker<T>,

    /// Aggregated runtime metrics for diagnostics.
    metrics: LSTMOptimizerMetrics,

    /// Number of `lstm_step` calls performed so far.
    step_count: usize,

    /// Random number generator owned by the optimizer.
    // NOTE(review): not used by the code visible in this file; dropout
    // creates its own thread_rng — confirm whether this field is needed.
    rng: scirs2_core::random::CoreRandom,
}
47
/// Stacked LSTM network with optional attention used by the learned optimizer.
#[derive(Debug, Clone)]
pub struct LSTMNetwork<T: Float + Debug + Send + Sync + 'static> {
    /// Stacked LSTM layers, applied in order.
    layers: Vec<LSTMLayer<T>>,

    /// Final projection from hidden state to update space.
    output_projection: OutputProjection<T>,

    /// Optional multi-head attention applied after the LSTM stack.
    attention: Option<AttentionMechanism<T>>,

    /// One layer normalization per LSTM layer.
    layer_norms: Vec<LayerNormalization<T>>,

    /// Dropout probability applied between layers (0.0 disables dropout).
    dropout_rate: f64,
}
66
/// A single LSTM layer with its gate weights and recurrent state.
///
/// Weight/bias rows are laid out as four stacked gate blocks of size
/// `hiddensize` each: input, forget, cell, output (see `LSTMLayer::forward`).
#[derive(Debug, Clone)]
pub struct LSTMLayer<T: Float + Debug + Send + Sync + 'static> {
    /// Input-to-hidden weights, shape `(4 * hiddensize, input_size)`.
    weight_ih: Array2<T>,

    /// Hidden-to-hidden (recurrent) weights, shape `(4 * hiddensize, hiddensize)`.
    weight_hh: Array2<T>,

    /// Input-to-hidden bias, length `4 * hiddensize`.
    bias_ih: Array1<T>,

    /// Hidden-to-hidden bias, length `4 * hiddensize`.
    bias_hh: Array1<T>,

    /// Current hidden state `h_t`, length `hiddensize`.
    hidden_state: Array1<T>,

    /// Current cell state `c_t`, length `hiddensize`.
    cell_state: Array1<T>,

    /// Number of hidden units in this layer.
    hiddensize: usize,
}
91
/// Linear projection plus nonlinearity that maps the LSTM hidden state to
/// the final update vector.
#[derive(Debug, Clone)]
pub struct OutputProjection<T: Float + Debug + Send + Sync + 'static> {
    /// Projection weights, shape `(output_size, input_size)`.
    weights: Array2<T>,

    /// Projection bias, length `output_size`.
    bias: Array1<T>,

    /// Nonlinearity applied to the projected output.
    output_transform: OutputTransform,
}
104
105impl<T: Float + Debug + Send + Sync + 'static + Default + Clone> OutputProjection<T> {
106 pub fn new(
108 input_size: usize,
109 output_size: usize,
110 output_transform: OutputTransform,
111 ) -> Result<Self> {
112 let weights = Array2::zeros((output_size, input_size));
113 let bias = Array1::zeros(output_size);
114
115 Ok(Self {
116 weights,
117 bias,
118 output_transform,
119 })
120 }
121
122 pub fn forward(&self, input: &Array1<T>) -> Result<Array1<T>> {
124 Ok(input.clone())
126 }
127}
128
/// Nonlinearity applied to the output projection.
#[derive(Debug, Clone, Copy)]
pub enum OutputTransform {
    /// No transformation.
    Identity,

    /// Hyperbolic tangent.
    Tanh,

    /// `tanh(x) * scale` — bounds update magnitudes to `[-scale, scale]`.
    ScaledTanh { scale: f64 },

    /// Scale chosen from the gradient norm at runtime.
    AdaptiveScale,

    /// Learned nonlinearity (currently a tanh built from exponentials).
    LearnedNonlinear,
}
147
/// Multi-head attention applied after the LSTM stack.
#[derive(Debug, Clone)]
pub struct AttentionMechanism<T: Float + Debug + Send + Sync + 'static> {
    /// Query projection, shape `(hidden, hidden)`.
    query_proj: Array2<T>,

    /// Key projection, shape `(hidden, hidden)`.
    key_proj: Array2<T>,

    /// Value projection, shape `(hidden, hidden)`.
    value_proj: Array2<T>,

    /// Output projection, shape `(hidden, hidden)`.
    output_proj: Array2<T>,

    /// Number of attention heads.
    num_heads: usize,

    /// Per-head dimension (`hidden / num_heads`).
    head_size: usize,

    /// Most recent attention weights, if a forward pass stored them.
    attentionweights: Option<Array2<T>>,
}
172
173impl<T: Float + Debug + Send + Sync + 'static + Default + Clone> AttentionMechanism<T> {
174 pub fn new(config: &LearnedOptimizerConfig) -> Result<Self> {
176 let hiddensize = config.hidden_size;
177 let num_heads = config.attention_heads;
178 let head_size = hiddensize / num_heads;
179
180 Ok(Self {
181 query_proj: Array2::zeros((hiddensize, hiddensize)),
182 key_proj: Array2::zeros((hiddensize, hiddensize)),
183 value_proj: Array2::zeros((hiddensize, hiddensize)),
184 output_proj: Array2::zeros((hiddensize, hiddensize)),
185 num_heads,
186 head_size,
187 attentionweights: None,
188 })
189 }
190
191 pub fn forward(&mut self, input: &Array1<T>) -> Result<Array1<T>> {
193 Ok(input.clone())
195 }
196}
197
/// Layer normalization parameters (gain, shift, numerical epsilon).
#[derive(Debug, Clone)]
pub struct LayerNormalization<T: Float + Debug + Send + Sync + 'static> {
    /// Per-feature scale, initialized to ones.
    gamma: Array1<T>,

    /// Per-feature shift, initialized to zeros.
    beta: Array1<T>,

    /// Small constant added to the variance for numerical stability.
    epsilon: T,
}
210
211impl<T: Float + Debug + Send + Sync + 'static + Default + Clone> LayerNormalization<T> {
212 pub fn new(features: usize) -> Result<Self> {
214 Ok(Self {
215 gamma: Array1::ones(features),
216 beta: Array1::zeros(features),
217 epsilon: scirs2_core::numeric::NumCast::from(1e-5).unwrap_or_else(|| T::zero()),
218 })
219 }
220
221 pub fn forward(&self, input: &Array1<T>) -> Result<Array1<T>> {
223 Ok(input.clone())
225 }
226}
227
/// Bounded rolling history of optimization quantities used to build the
/// LSTM input features.
#[derive(Debug, Clone)]
pub struct HistoryBuffer<T: Float + Debug + Send + Sync + 'static> {
    /// Recent flattened gradients, newest at the back.
    gradients: VecDeque<Array1<T>>,

    /// Recent flattened parameter vectors, newest at the back.
    parameters: VecDeque<Array1<T>>,

    /// Recent loss values (only recorded when a loss was supplied).
    losses: VecDeque<T>,

    /// Recent learning rates.
    // NOTE(review): never written in the code visible here — confirm intent.
    learning_rates: VecDeque<T>,

    /// Recent update magnitudes.
    // NOTE(review): never written in the code visible here — confirm intent.
    update_magnitudes: VecDeque<T>,

    /// Maximum number of entries retained per series.
    _maxlength: usize,

    /// Cached derived features; invalidated on every `update`.
    feature_cache: Option<Array2<T>>,
}
252
/// Meta-learning component that trains the LSTM optimizer across tasks.
#[derive(Debug, Clone)]
pub struct MetaLearner<T: Float + Debug + Send + Sync + 'static> {
    /// Meta-optimization strategy in use (e.g. MAML).
    strategy: MetaOptimizationStrategy,

    /// Named meta-parameter vectors.
    meta_parameters: HashMap<String, Array1<T>>,

    /// Gradients of the meta-parameters.
    meta_gradients: HashMap<String, Array1<T>>,

    /// Previously seen meta-tasks.
    task_history: VecDeque<MetaTask<T>>,

    /// Current outer-/inner-loop meta-learning state.
    meta_state: MetaLearningState<T>,

    /// Cross-domain transfer-learning component.
    transfer_learner: TransferLearner<T>,
}
274
/// One meta-learning task: its optimization trajectory plus descriptors.
#[derive(Debug, Clone)]
pub struct MetaTask<T: Float + Debug + Send + Sync + 'static> {
    /// Unique task identifier.
    pub id: String,

    /// Category of learning problem.
    pub task_type: TaskType,

    /// Recorded optimization trajectory for this task.
    pub training_trajectory: Vec<TrajectoryPoint<T>>,

    /// Final performance achieved on the task.
    pub final_performance: T,

    /// Descriptive features of the task's loss landscape.
    pub characteristics: TaskCharacteristics<T>,

    /// Relative weight of this task in meta-training.
    pub weight: T,
}
296
/// Category of learning problem a meta-task belongs to.
#[derive(Debug, Clone, Copy)]
pub enum TaskType {
    /// Labeled-data training.
    SupervisedLearning,

    /// Reward-driven training.
    ReinforcementLearning,

    /// Training without labels.
    UnsupervisedLearning,

    /// Adaptation from very few examples.
    FewShotLearning,

    /// Streaming / incremental training.
    OnlineLearning,

    /// Training against adversarial examples.
    AdversarialTraining,
}
318
/// Snapshot of one optimization step within a task trajectory.
#[derive(Debug, Clone)]
pub struct TrajectoryPoint<T: Float + Debug + Send + Sync + 'static> {
    /// Step index within the trajectory.
    pub step: usize,

    /// Flattened gradient at this step.
    pub gradient: Array1<T>,

    /// Flattened parameters at this step.
    pub parameters: Array1<T>,

    /// Loss value at this step.
    pub loss: T,

    /// Learning rate used at this step.
    pub learning_rate: T,

    /// Update vector that was applied.
    pub update: Array1<T>,
}
340
/// Descriptive features of a task's optimization landscape.
#[derive(Debug, Clone)]
pub struct TaskCharacteristics<T: Float + Debug + Send + Sync + 'static> {
    /// Number of optimized parameters.
    pub dimensionality: usize,

    /// Estimated local curvature.
    pub curvature: T,

    /// Estimated gradient-noise level.
    pub noise_level: T,

    /// Estimated conditioning (e.g. of the Hessian).
    pub conditioning: T,

    /// Overall difficulty score.
    pub difficulty: T,

    /// Embedding of domain-specific features.
    pub domain_features: Array1<T>,
}
362
/// Mutable state of the meta-learning outer and inner loops.
#[derive(Debug, Clone)]
pub struct MetaLearningState<T: Float + Debug + Send + Sync + 'static> {
    /// Outer-loop step counter.
    pub meta_step: usize,

    /// Outer-loop (meta) learning rate.
    pub meta_lr: T,

    /// Inner-loop adaptation rate.
    pub adaptation_rate: T,

    /// Latest meta-validation performance.
    pub meta_validation_performance: T,

    /// Record of past cross-task adaptations.
    pub adaptation_history: VecDeque<AdaptationEvent<T>>,

    /// State of the current inner adaptation loop.
    pub inner_loop_state: InnerLoopState<T>,
}
384
/// Record of one source-to-target task adaptation.
#[derive(Debug, Clone)]
pub struct AdaptationEvent<T: Float + Debug + Send + Sync + 'static> {
    /// Task adapted from.
    pub source_task: String,

    /// Task adapted to.
    pub target_task: String,

    /// Number of adaptation steps taken.
    pub adaptation_steps: usize,

    /// How efficiently knowledge transferred.
    pub transfer_efficiency: T,

    /// Performance gain achieved by the adaptation.
    pub performance_improvement: T,
}
403
/// State of a single inner (task-level) adaptation loop.
#[derive(Debug, Clone)]
pub struct InnerLoopState<T: Float + Debug + Send + Sync + 'static> {
    /// Inner-loop step counter.
    pub inner_step: usize,

    /// Parameters being adapted in the inner loop.
    pub inner_parameters: Array1<T>,

    /// Named state vectors of the inner-loop optimizer.
    pub inner_optimizer_state: HashMap<String, Array1<T>>,

    /// Latest inner-loop performance.
    pub inner_performance: T,
}
419
/// Transfers optimizer knowledge between task domains.
#[derive(Debug, Clone)]
pub struct TransferLearner<T: Float + Debug + Send + Sync + 'static> {
    /// Knowledge vectors extracted from source domains, keyed by name.
    pub source_knowledge: HashMap<String, Array1<T>>,

    /// Parameters governing the adaptation process.
    pub adaptation_parameters: Array1<T>,

    /// Metrics summarizing transfer quality.
    pub transfer_metrics: TransferMetrics<T>,

    /// Estimates similarity between source and target domains.
    pub similarity_estimator: DomainSimilarityEstimator<T>,
}
435
/// Metrics summarizing the quality of a knowledge transfer.
#[derive(Debug, Clone)]
pub struct TransferMetrics<T: Float + Debug + Send + Sync + 'static> {
    /// Overall transfer efficiency.
    pub efficiency: T,

    /// Speed of adaptation to the target domain.
    pub adaptation_speed: T,

    /// How much source knowledge is retained after adaptation.
    pub knowledge_retention: T,

    /// Degree to which the transfer hurt target performance.
    pub negative_transfer_score: T,
}
451
/// Estimates similarity between task domains via learned embeddings.
#[derive(Debug, Clone)]
pub struct DomainSimilarityEstimator<T: Float + Debug + Send + Sync + 'static> {
    /// Embedding vector per domain, keyed by domain name.
    pub domain_embeddings: HashMap<String, Array1<T>>,

    /// Parameters of the similarity measure.
    pub similarity_params: Array1<T>,

    /// Which similarity measure to use.
    pub similarity_function: SimilarityFunction,
}
464
/// Similarity measure between domain embeddings.
#[derive(Debug, Clone, Copy)]
pub enum SimilarityFunction {
    /// Cosine similarity.
    Cosine,

    /// (Negative) Euclidean distance.
    Euclidean,

    /// Learned parametric metric.
    LearnedMetric,

    /// Metric specialized per task.
    TaskSpecific,
}
480
/// Adapts the learning rate online from gradient and loss signals.
#[derive(Debug, Clone)]
pub struct AdaptiveLearningRateController<T: Float + Debug + Send + Sync + 'static> {
    /// Baseline learning rate.
    base_lr: T,

    /// Learning rate currently in effect.
    current_lr: T,

    /// Sensitivities and bounds for the adaptation rule.
    adaptation_params: LRAdaptationParams<T>,

    /// Recent learning-rate values.
    lr_history: VecDeque<T>,

    /// Tracks loss trend to inform adaptation.
    performance_tracker: PerformanceTracker<T>,

    /// Optional learned schedule parameters.
    schedule_params: Option<Array1<T>>,
}
502
/// Tunables for the adaptive learning-rate rule.
#[derive(Debug, Clone)]
pub struct LRAdaptationParams<T: Float + Debug + Send + Sync + 'static> {
    /// Momentum applied to learning-rate changes.
    pub momentum: T,

    /// How strongly gradient statistics move the learning rate.
    pub gradient_sensitivity: T,

    /// How strongly loss changes move the learning rate.
    pub loss_sensitivity: T,

    /// Lower bound on the learning rate.
    pub min_lr: T,

    /// Upper bound on the learning rate.
    pub max_lr: T,

    /// Speed at which the learning rate adapts.
    pub adaptation_rate: T,
}
524
/// Tracks recent loss values and classifies the performance trend.
#[derive(Debug, Clone)]
pub struct PerformanceTracker<T: Float + Debug + Send + Sync + 'static> {
    /// Most recent loss values.
    recent_losses: VecDeque<T>,

    /// Classified trend of the losses.
    trend: PerformanceTrend,

    /// Consecutive steps without improvement.
    stagnation_counter: usize,

    /// Best loss observed so far.
    best_performance: T,

    /// Estimated rate of improvement.
    improvement_rate: T,
}
543
/// Qualitative trend of recent performance.
#[derive(Debug, Clone, Copy)]
pub enum PerformanceTrend {
    /// Loss is decreasing.
    Improving,

    /// Loss is flat.
    Stagnating,

    /// Loss is increasing.
    Degrading,

    /// Loss is fluctuating without a clear direction.
    Oscillating,

    /// Not enough data to classify.
    Unknown,
}
562
/// Tracks the qualitative state of an ongoing optimization run.
#[derive(Debug, Clone)]
pub struct OptimizationStateTracker<T: Float + Debug + Send + Sync + 'static> {
    /// Current phase of the optimization.
    phase: OptimizationPhase,

    /// Signals indicating proximity to convergence.
    convergence_indicators: ConvergenceIndicators<T>,

    /// Statistical analysis of the gradient stream.
    gradient_analyzer: GradientAnalyzer<T>,

    /// Analysis of the local loss landscape.
    landscape_analyzer: LossLandscapeAnalyzer<T>,

    /// Stability / robustness measurements.
    stability_metrics: StabilityMetrics<T>,
}
581
/// Qualitative phase of an optimization run.
#[derive(Debug, Clone, Copy)]
pub enum OptimizationPhase {
    /// Early rapid loss decrease.
    InitialDescent,

    /// Consistent, steady improvement.
    SteadyProgress,

    /// Small refinements near an optimum.
    FineTuning,

    /// Optimization has converged.
    Converged,

    /// No progress for an extended period.
    Plateau,

    /// Loss is increasing; the run is diverging.
    Diverging,
}
603
/// Signals used to estimate proximity to convergence.
#[derive(Debug, Clone)]
pub struct ConvergenceIndicators<T: Float + Debug + Send + Sync + 'static> {
    /// Recent gradient-norm values.
    pub gradient_norm_trend: Vec<T>,

    /// Recent loss-change values.
    pub loss_change_trend: Vec<T>,

    /// Magnitude of the latest parameter change.
    pub parameter_change_magnitude: T,

    /// Estimated probability that the run has converged.
    pub convergence_probability: T,

    /// Predicted remaining steps to convergence, if estimable.
    pub estimated_steps_to_convergence: Option<usize>,
}
622
/// Aggregates statistical analyses of the gradient stream.
#[derive(Debug, Clone)]
pub struct GradientAnalyzer<T: Float + Debug + Send + Sync + 'static> {
    /// Summary statistics of recent gradients.
    pub gradient_stats: GradientStatistics<T>,

    /// Temporal and cross correlations of gradients.
    pub correlation_tracker: GradientCorrelationTracker<T>,

    /// Gradient-noise estimation.
    pub noise_estimator: GradientNoiseEstimator<T>,

    /// Gradient flow-field analysis.
    pub flow_analyzer: GradientFlowAnalyzer<T>,
}
638
/// Summary statistics of recent gradients.
#[derive(Debug, Clone)]
pub struct GradientStatistics<T: Float + Debug + Send + Sync + 'static> {
    /// Mean gradient norm.
    pub mean_norm: T,

    /// Variance of the gradient norm.
    pub norm_variance: T,

    /// How consistent gradient directions are over time.
    pub direction_consistency: T,

    /// Histogram/summary of gradient magnitudes.
    pub magnitude_distribution: Vec<T>,

    /// Per-component statistics.
    pub component_stats: Array1<T>,
}
657
/// Tracks correlations within and between gradient sequences.
#[derive(Debug, Clone)]
pub struct GradientCorrelationTracker<T: Float + Debug + Send + Sync + 'static> {
    /// Pairwise correlation matrix of gradient components.
    pub correlation_matrix: Array2<T>,

    /// Correlation of gradients across time steps.
    pub temporal_correlations: VecDeque<T>,

    /// Named cross-correlations with other signals.
    pub cross_correlations: HashMap<String, T>,
}
670
/// Estimates the noise content of the gradient signal.
#[derive(Debug, Clone)]
pub struct GradientNoiseEstimator<T: Float + Debug + Send + Sync + 'static> {
    /// Estimated noise magnitude.
    pub noise_level: T,

    /// Estimated signal-to-noise ratio.
    pub signal_to_noise_ratio: T,

    /// Qualitative description of the noise.
    pub noise_characteristics: NoiseCharacteristics<T>,
}
683
/// Qualitative description of gradient noise.
#[derive(Debug, Clone)]
pub struct NoiseCharacteristics<T: Float + Debug + Send + Sync + 'static> {
    /// Spectral class of the noise.
    pub noise_type: NoiseType,

    /// Noise scale.
    pub scale: T,

    /// Correlation of the noise across time.
    pub temporal_correlation: T,

    /// Correlation of the noise across components.
    pub spatial_correlation: T,
}
699
/// Spectral class of gradient noise.
#[derive(Debug, Clone, Copy)]
pub enum NoiseType {
    /// Uncorrelated (white) noise.
    White,

    /// Temporally correlated (colored) noise.
    Colored,

    /// Noise with identifiable structure.
    Structured,

    /// Noise whose character changes over time.
    Adaptive,
}
715
/// Analysis of the gradient flow field (fixed points, stability).
#[derive(Debug, Clone)]
pub struct GradientFlowAnalyzer<T: Float + Debug + Send + Sync + 'static> {
    /// Sampled gradient flow field.
    pub flow_field: Array2<T>,

    /// Detected critical points of the flow.
    pub critical_points: Vec<Array1<T>>,

    /// Overall stability classification of the flow.
    pub stability: FlowStability,

    /// Attracting fixed points.
    pub attractors: Vec<Array1<T>>,
    /// Repelling fixed points.
    pub repellers: Vec<Array1<T>>,
}
732
/// Stability classification of a gradient flow field.
#[derive(Debug, Clone, Copy)]
pub enum FlowStability {
    /// Flow converges to fixed points.
    Stable,

    /// Flow diverges from fixed points.
    Unstable,

    /// Flow exhibits chaotic behavior.
    Chaotic,

    /// Not enough data to classify.
    Unknown,
}
748
/// Local analysis of the loss landscape around the current parameters.
#[derive(Debug, Clone)]
pub struct LossLandscapeAnalyzer<T: Float + Debug + Send + Sync + 'static> {
    /// Estimated local curvature.
    pub local_curvature: T,

    /// Hessian eigenvalues, if computed.
    pub hessian_eigenvalues: Option<Array1<T>>,

    /// Roughness of the local landscape.
    pub roughness: T,

    /// Estimated size of the current basin of attraction.
    pub basin_size: T,

    /// Heights of barriers separating nearby basins.
    pub barrier_heights: Vec<T>,
}
767
/// Stability and robustness measurements of the optimization dynamics.
#[derive(Debug, Clone)]
pub struct StabilityMetrics<T: Float + Debug + Send + Sync + 'static> {
    /// Estimated Lyapunov exponents of the dynamics.
    pub lyapunov_exponents: Array1<T>,

    /// Margin to instability.
    pub stability_margin: T,

    /// Sensitivity of the trajectory to small perturbations.
    pub perturbation_sensitivity: T,

    /// Aggregate robustness score.
    pub robustness_score: T,
}
783
/// Runtime metrics reported by the LSTM optimizer.
#[derive(Debug, Clone)]
pub struct LSTMOptimizerMetrics {
    /// Latest meta-training loss.
    pub meta_learning_loss: f64,

    /// Average convergence speed across tasks.
    pub avg_convergence_speed: f64,

    /// Performance on held-out tasks.
    pub generalization_performance: f64,

    /// Ratio of update norm to gradient norm.
    pub adaptation_efficiency: f64,

    /// Fraction of successful cross-domain transfers.
    pub transfer_success_rate: f64,

    /// Estimated memory footprint in megabytes.
    pub memory_usage_mb: f64,

    /// Estimated compute overhead relative to plain SGD (1.0 = none).
    pub computational_overhead: f64,

    /// Statistics of the LSTM network internals.
    pub lstm_stats: LSTMNetworkStats,

    /// Attention statistics, when attention is enabled and has run.
    pub attention_stats: Option<AttentionStats>,
}
814
/// Statistics describing the internal behavior of the LSTM network.
#[derive(Debug, Clone)]
pub struct LSTMNetworkStats {
    /// Per-gate activation statistics.
    pub gate_activations: GateActivationStats,

    /// Statistics of the hidden state values.
    pub hidden_state_stats: StateStatistics,

    /// Statistics of the cell state values.
    pub cell_state_stats: StateStatistics,

    /// Gradient-flow health across layers.
    pub gradient_flow_stats: GradientFlowStats,
}
830
/// Activation statistics for each of the four LSTM gates.
#[derive(Debug, Clone)]
pub struct GateActivationStats {
    /// Input gate activations.
    pub input_gate: StateStatistics,

    /// Forget gate activations.
    pub forget_gate: StateStatistics,

    /// Output gate activations.
    pub output_gate: StateStatistics,

    /// Cell (candidate) gate activations.
    pub cell_gate: StateStatistics,
}
846
/// Summary statistics of a vector of state values.
#[derive(Debug, Clone)]
pub struct StateStatistics {
    /// Mean value.
    pub mean: f64,

    /// Standard deviation.
    pub std: f64,

    /// Minimum value.
    pub min: f64,

    /// Maximum value.
    pub max: f64,

    /// Percentage of values with |x| > 0.95 (near tanh/sigmoid saturation).
    pub saturation_percent: f64,
}
865
/// Gradient-flow health indicators across network layers.
#[derive(Debug, Clone)]
pub struct GradientFlowStats {
    /// Gradient norm per layer.
    pub layer_gradient_norms: Vec<f64>,

    /// Gradient correlation per layer.
    pub layer_correlations: Vec<f64>,

    /// Degree to which gradients are vanishing.
    pub vanishing_gradient_score: f64,

    /// Degree to which gradients are exploding.
    pub exploding_gradient_score: f64,
}
881
/// Statistics describing attention-weight behavior.
#[derive(Debug, Clone)]
pub struct AttentionStats {
    /// Shannon entropy of the attention weights.
    pub attention_entropy: f64,

    /// Inverse-entropy concentration measure.
    pub attention_concentration: f64,

    /// Diversity across attention heads.
    pub head_diversity: f64,

    /// Temporal attention patterns (fixed-length summary).
    pub temporal_patterns: Vec<f64>,
}
897
impl<
        T: Float
            + Default
            + Clone
            + Send
            + Sync
            + std::iter::Sum
            + for<'a> std::iter::Sum<&'a T>
            + scirs2_core::ndarray::ScalarOperand
            + std::fmt::Debug,
    > LSTMOptimizer<T>
{
    /// Creates a new LSTM optimizer after validating `config`.
    ///
    /// # Errors
    /// Returns `OptimError::InvalidConfig` when `config` fails the checks
    /// in `validate_config`, or when a sub-component fails to construct.
    pub fn new(config: LearnedOptimizerConfig) -> Result<Self> {
        Self::validate_config(&config)?;

        let lstm_network = LSTMNetwork::new(&config)?;

        let history_buffer = HistoryBuffer::new(config.gradient_history_size);

        let meta_learner = MetaLearner::new(&config)?;

        let lr_controller = AdaptiveLearningRateController::new(&config)?;

        let state_tracker = OptimizationStateTracker::new();

        let metrics = LSTMOptimizerMetrics::new();

        let rng = scirs2_core::random::thread_rng();

        Ok(Self {
            config,
            lstm_network,
            history_buffer,
            meta_learner,
            lr_controller,
            state_tracker,
            metrics,
            step_count: 0,
            rng,
        })
    }

    /// Performs one learned update step: flattens the inputs, runs the LSTM
    /// over gradient features, and returns the updated parameters in the
    /// original shape. `loss` is optional and only enriches the history.
    pub fn lstm_step<S, D>(
        &mut self,
        parameters: &ArrayBase<S, D>,
        gradients: &ArrayBase<S, D>,
        loss: Option<T>,
    ) -> Result<Array<T, D>>
    where
        S: Data<Elem = T>,
        D: Dimension + Clone,
    {
        // Work on flat 1-D copies; the original shape is restored at the end.
        let flat_params = self.flatten_to_1d(parameters)?;
        let flat_gradients = self.flatten_to_1d(gradients)?;

        self.history_buffer
            .update(&flat_params, &flat_gradients, loss);

        let lstm_input = self.prepare_lstm_input(&flat_gradients)?;

        let lstm_output = self.lstm_network.forward(&lstm_input)?;

        let learning_rate =
            self.lr_controller
                .compute_lr(&flat_gradients, loss, &self.history_buffer)?;

        let updates = self.generate_updates(&lstm_output, &flat_gradients, learning_rate)?;

        // Gradient-descent convention: subtract the predicted update.
        // NOTE(review): assumes `updates` has the same length as
        // `flat_params`; the network output length comes from the config —
        // confirm output_features matches the parameter count.
        let updated_flat = &flat_params - &updates;

        self.state_tracker.update(&flat_gradients, &updates, loss);

        self.update_metrics(&flat_gradients, &updates, learning_rate);

        let updated_params = self.reshape_from_1d(&updated_flat, parameters.raw_dim())?;

        self.step_count += 1;

        Ok(updated_params)
    }

    /// Runs one meta-learning (outer-loop) step over `tasks` and records
    /// the resulting meta-loss in the metrics.
    pub fn meta_learning_step(&mut self, tasks: &[MetaTask<T>]) -> Result<T> {
        let meta_loss = self.meta_learner.step(tasks, &mut self.lstm_network)?;

        self.metrics.meta_learning_loss = meta_loss.to_f64().unwrap_or(0.0);

        Ok(meta_loss)
    }

    /// Adapts the optimizer to a new task domain via the transfer learner.
    pub fn transfer_to_domain(
        &mut self,
        target_tasks: &[MetaTask<T>],
    ) -> Result<TransferResults<T>> {
        self.meta_learner
            .transfer_learner
            .transfer_to_domain(target_tasks, &mut self.lstm_network)
    }

    /// Returns a reference to the accumulated runtime metrics.
    pub fn get_metrics(&self) -> &LSTMOptimizerMetrics {
        &self.metrics
    }

    /// Returns a snapshot of the state tracker's current analysis.
    pub fn get_state_analysis(&self) -> OptimizationStateAnalysis<T> {
        OptimizationStateAnalysis {
            current_phase: self.state_tracker.phase,
            convergence_indicators: self.state_tracker.convergence_indicators.clone(),
            gradient_analysis: self.state_tracker.gradient_analyzer.clone(),
            landscape_analysis: self.state_tracker.landscape_analyzer.clone(),
            stability_metrics: self.state_tracker.stability_metrics.clone(),
        }
    }

    /// Builds the LSTM input feature vector from the current gradients,
    /// recent gradient differences, gradient summary statistics, and loss
    /// features; the result is padded/truncated to `config.input_features`.
    fn prepare_lstm_input(&self, gradients: &Array1<T>) -> Result<Array1<T>> {
        let mut features = Vec::new();

        // Raw gradient components. The unwrap is safe for a freshly built
        // (contiguous, standard-layout) Array1.
        features.extend_from_slice(gradients.as_slice().unwrap());

        // Differences to up to 5 previous gradients (momentum-like signal).
        if let Some(prev_gradients) = self.history_buffer.get_recent_gradients(5) {
            for prev_grad in prev_gradients {
                let grad_diff: Vec<T> = gradients
                    .iter()
                    .zip(prev_grad.iter())
                    .map(|(&g1, &g2)| g1 - g2)
                    .collect();
                features.extend(grad_diff);
            }
        }

        // Scalar summary statistics of the gradient vector.
        let grad_norm = gradients.iter().map(|&g| g * g).sum::<T>().sqrt();
        let grad_mean = gradients.iter().cloned().sum::<T>() / T::from(gradients.len()).unwrap();
        let grad_std = {
            let variance = gradients
                .iter()
                .map(|&g| (g - grad_mean) * (g - grad_mean))
                .sum::<T>()
                / T::from(gradients.len()).unwrap();
            variance.sqrt()
        };

        features.extend([grad_norm, grad_mean, grad_std]);

        // Loss change / ratio features, when at least two losses exist.
        if let Some(loss_features) = self.history_buffer.get_loss_features() {
            features.extend(loss_features);
        }

        // Force the configured input width: pad with zeros or truncate.
        features.resize(self.config.input_features, T::zero());

        Ok(Array1::from_vec(features))
    }

    /// Transforms the raw LSTM output via the configured nonlinearity and
    /// scales it by `learning_rate` to obtain the update vector.
    fn generate_updates(
        &self,
        lstm_output: &Array1<T>,
        gradients: &Array1<T>,
        learning_rate: T,
    ) -> Result<Array1<T>> {
        let transformed_output = match self.lstm_network.output_projection.output_transform {
            OutputTransform::Identity => lstm_output.clone(),
            OutputTransform::Tanh => lstm_output.mapv(|x| x.tanh()),
            OutputTransform::ScaledTanh { scale } => {
                let scale_t =
                    scirs2_core::numeric::NumCast::from(scale).unwrap_or_else(|| T::zero());
                lstm_output.mapv(|x| x.tanh() * scale_t)
            }
            OutputTransform::AdaptiveScale => {
                // Shrink updates when gradients are large: scale = 1/(1+||g||).
                let grad_norm = gradients.iter().map(|&g| g * g).sum::<T>().sqrt();
                let adaptive_scale = T::one() / (T::one() + grad_norm);
                lstm_output.mapv(|x| x * adaptive_scale)
            }
            OutputTransform::LearnedNonlinear => {
                // Currently tanh expressed via exponentials:
                // (e^x - e^-x) / (e^x + e^-x).
                lstm_output.mapv(|x| {
                    let exp_x = x.exp();
                    (exp_x - (-x).exp()) / (exp_x + (-x).exp())
                })
            }
        };

        let updates = &transformed_output * learning_rate;

        Ok(updates)
    }

    /// Refreshes diagnostic metrics after a step.
    // NOTE(review): `lr` is currently unused here, and the efficiency ratio
    // becomes NaN/inf when the gradient norm is zero — confirm intended.
    fn update_metrics(&mut self, gradients: &Array1<T>, updates: &Array1<T>, lr: T) {
        let grad_norm = gradients.iter().map(|&g| g * g).sum::<T>().sqrt();
        let update_norm = updates.iter().map(|&u| u * u).sum::<T>().sqrt();

        self.update_lstm_stats();

        self.metrics.adaptation_efficiency = (update_norm / grad_norm).to_f64().unwrap_or(1.0);

        self.metrics.computational_overhead = self.estimate_computational_overhead();

        self.metrics.memory_usage_mb = self.estimate_memory_usage();
    }

    /// Recomputes hidden/cell state statistics and attention statistics.
    // NOTE(review): each loop iteration overwrites the same metrics fields,
    // so only the last layer's statistics are kept — confirm intended.
    fn update_lstm_stats(&mut self) {
        for layer in self.lstm_network.layers.iter() {
            let hidden_stats = self.compute_state_stats(&layer.hidden_state);
            let cell_stats = self.compute_state_stats(&layer.cell_state);

            self.metrics.lstm_stats.hidden_state_stats = hidden_stats;
            self.metrics.lstm_stats.cell_state_stats = cell_stats;
        }

        if let Some(ref attention) = self.lstm_network.attention {
            if let Some(ref attentionweights) = attention.attentionweights {
                self.metrics.attention_stats = Some(self.compute_attention_stats(attentionweights));
            }
        }
    }

    /// Computes mean/std/min/max and the fraction of near-saturated values
    /// (|x| > 0.95) for one state vector.
    fn compute_state_stats(&self, state: &Array1<T>) -> StateStatistics {
        let values: Vec<f64> = state.iter().map(|&x| x.to_f64().unwrap_or(0.0)).collect();

        let mean = values.iter().sum::<f64>() / values.len() as f64;
        let variance =
            values.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / values.len() as f64;
        let std = variance.sqrt();
        let min = values.iter().cloned().fold(f64::INFINITY, f64::min);
        let max = values.iter().cloned().fold(f64::NEG_INFINITY, f64::max);

        let saturation_count = values.iter().filter(|&&x| x.abs() > 0.95).count();
        let saturation_percent = saturation_count as f64 / values.len() as f64 * 100.0;

        StateStatistics {
            mean,
            std,
            min,
            max,
            saturation_percent,
        }
    }

    /// Derives entropy/concentration/diversity statistics from the raw
    /// attention weight matrix.
    fn compute_attention_stats(&self, attentionweights: &Array2<T>) -> AttentionStats {
        let weights: Vec<f64> = attentionweights
            .iter()
            .map(|&w| w.to_f64().unwrap_or(0.0))
            .collect();

        // Shannon entropy over positive weights.
        let entropy = weights
            .iter()
            .filter(|&&w| w > 0.0)
            .map(|&w| -w * w.ln())
            .sum::<f64>();

        let concentration = 1.0 / (1.0 + entropy);

        let head_diversity = weights.iter().map(|&w| w.abs()).sum::<f64>() / weights.len() as f64;

        AttentionStats {
            attention_entropy: entropy,
            attention_concentration: concentration,
            head_diversity,
            temporal_patterns: vec![0.0; 10], // placeholder, not yet computed
        }
    }

    /// Rough relative compute-cost estimate (1.0 = plain SGD baseline).
    fn estimate_computational_overhead(&self) -> f64 {
        let lstm_overhead = self.config.num_layers as f64 * 0.1;
        let attention_overhead = if self.config.use_attention { 0.2 } else { 0.0 };
        let meta_learning_overhead = 0.1;

        1.0 + lstm_overhead + attention_overhead + meta_learning_overhead
    }

    /// Rough memory-footprint estimate in MB (8 bytes per element assumed).
    fn estimate_memory_usage(&self) -> f64 {
        let parameter_memory =
            self.config.hidden_size as f64 * self.config.num_layers as f64 * 8.0 / 1024.0 / 1024.0;
        let history_memory =
            self.config.gradient_history_size as f64 * self.config.input_features as f64 * 8.0
                / 1024.0
                / 1024.0;
        let lstm_state_memory =
            self.config.hidden_size as f64 * self.config.num_layers as f64 * 2.0 * 8.0
                / 1024.0
                / 1024.0;

        parameter_memory + history_memory + lstm_state_memory
    }

    /// Rejects configurations with zero sizes or a non-positive meta
    /// learning rate.
    fn validate_config(config: &LearnedOptimizerConfig) -> Result<()> {
        if config.hidden_size == 0 {
            return Err(OptimError::InvalidConfig(
                "Hidden size must be positive".to_string(),
            ));
        }

        if config.num_layers == 0 {
            return Err(OptimError::InvalidConfig(
                "Number of layers must be positive".to_string(),
            ));
        }

        if config.input_features == 0 {
            return Err(OptimError::InvalidConfig(
                "Input features must be positive".to_string(),
            ));
        }

        if config.meta_learning_rate <= 0.0 {
            return Err(OptimError::InvalidConfig(
                "Meta learning rate must be positive".to_string(),
            ));
        }

        Ok(())
    }

    /// Copies an n-dimensional array into a flat `Array1` (row-major order).
    fn flatten_to_1d<S, D>(&self, array: &ArrayBase<S, D>) -> Result<Array1<T>>
    where
        S: Data<Elem = T>,
        D: Dimension,
    {
        Ok(Array1::from_iter(array.iter().cloned()))
    }

    /// Reshapes a flat vector back into `shape`; errors if lengths differ.
    fn reshape_from_1d<D>(&self, flat: &Array1<T>, shape: D) -> Result<Array<T, D>>
    where
        D: Dimension + Clone,
    {
        Array::from_shape_vec(shape, flat.to_vec())
            .map_err(|e| OptimError::InvalidConfig(format!("Reshape error: {}", e)))
    }
}
1280
1281impl<T: Float + Debug + Default + Clone + 'static + Send + Sync> LSTMNetwork<T> {
1284 fn new(config: &LearnedOptimizerConfig) -> Result<Self> {
1286 let mut layers = Vec::new();
1287
1288 for i in 0..config.num_layers {
1290 let input_size = if i == 0 {
1291 config.input_features
1292 } else {
1293 config.hidden_size
1294 };
1295 let layer = LSTMLayer::new(input_size, config.hidden_size)?;
1296 layers.push(layer);
1297 }
1298
1299 let output_projection = OutputProjection::new(
1301 config.hidden_size,
1302 config.output_features,
1303 OutputTransform::ScaledTanh { scale: 0.1 },
1304 )?;
1305
1306 let attention = if config.use_attention {
1308 Some(AttentionMechanism::new(config)?)
1309 } else {
1310 None
1311 };
1312
1313 let layer_norms = (0..config.num_layers)
1315 .map(|_| LayerNormalization::new(config.hidden_size))
1316 .collect::<Result<Vec<_>>>()?;
1317
1318 Ok(Self {
1319 layers,
1320 output_projection,
1321 attention,
1322 layer_norms,
1323 dropout_rate: config.dropout_rate,
1324 })
1325 }
1326
1327 fn forward(&mut self, input: &Array1<T>) -> Result<Array1<T>> {
1329 let mut current_input = input.clone();
1330
1331 for i in 0..self.layers.len() {
1333 current_input = self.layers[i].forward(¤t_input)?;
1334
1335 current_input = self.layer_norms[i].forward(¤t_input)?;
1337
1338 if self.dropout_rate > 0.0 {
1340 current_input = self.apply_dropout(¤t_input)?;
1341 }
1342 }
1343
1344 if let Some(ref mut attention) = self.attention {
1346 current_input = attention.forward(¤t_input)?;
1347 }
1348
1349 let output = self.output_projection.forward(¤t_input)?;
1351
1352 Ok(output)
1353 }
1354
1355 fn apply_dropout(&self, input: &Array1<T>) -> Result<Array1<T>> {
1357 Ok(input.mapv(|x| {
1359 if T::from(scirs2_core::random::thread_rng().gen_range(0.0..1.0)).unwrap()
1360 < scirs2_core::numeric::NumCast::from(self.dropout_rate)
1361 .unwrap_or_else(|| T::zero())
1362 {
1363 T::zero()
1364 } else {
1365 x / scirs2_core::numeric::NumCast::from(1.0 - self.dropout_rate)
1366 .unwrap_or_else(|| T::zero())
1367 }
1368 }))
1369 }
1370}
1371
1372impl<T: Float + Debug + Default + Clone + 'static + Send + Sync> LSTMLayer<T> {
1373 fn new(_input_size: usize, hiddensize: usize) -> Result<Self> {
1375 let scale = (2.0 / (_input_size + hiddensize) as f64).sqrt();
1377
1378 Ok(Self {
1379 weight_ih: Self::xavier_init(4 * hiddensize, _input_size, scale),
1380 weight_hh: Self::xavier_init(4 * hiddensize, hiddensize, scale),
1381 bias_ih: Array1::zeros(4 * hiddensize),
1382 bias_hh: Array1::zeros(4 * hiddensize),
1383 hidden_state: Array1::zeros(hiddensize),
1384 cell_state: Array1::zeros(hiddensize),
1385 hiddensize,
1386 })
1387 }
1388
1389 fn forward(&mut self, input: &Array1<T>) -> Result<Array1<T>> {
1391 let ih_linear = self.weight_ih.dot(input) + &self.bias_ih;
1393 let hh_linear = self.weight_hh.dot(&self.hidden_state) + &self.bias_hh;
1394 let gates = ih_linear + hh_linear;
1395
1396 let input_gate = Self::sigmoid(&gates.slice(s![0..self.hiddensize]).to_owned());
1398 let forget_gate = Self::sigmoid(
1399 &gates
1400 .slice(s![self.hiddensize..2 * self.hiddensize])
1401 .to_owned(),
1402 );
1403 let cell_gate = Self::tanh(
1404 &gates
1405 .slice(s![2 * self.hiddensize..3 * self.hiddensize])
1406 .to_owned(),
1407 );
1408 let output_gate = Self::sigmoid(
1409 &gates
1410 .slice(s![3 * self.hiddensize..4 * self.hiddensize])
1411 .to_owned(),
1412 );
1413
1414 self.cell_state = &forget_gate * &self.cell_state + &input_gate * &cell_gate;
1416
1417 self.hidden_state = &output_gate * &Self::tanh(&self.cell_state);
1419
1420 Ok(self.hidden_state.clone())
1421 }
1422
1423 fn xavier_init(rows: usize, cols: usize, scale: f64) -> Array2<T> {
1425 Array2::from_shape_fn((rows, cols), |_| {
1426 let val = (scirs2_core::random::thread_rng().gen_range(0.0..1.0) - 0.5) * 2.0 * scale;
1427 scirs2_core::numeric::NumCast::from(val).unwrap_or_else(|| T::zero())
1428 })
1429 }
1430
1431 fn sigmoid(x: &Array1<T>) -> Array1<T> {
1433 x.mapv(|xi| T::one() / (T::one() + (-xi).exp()))
1434 }
1435
1436 fn tanh(x: &Array1<T>) -> Array1<T> {
1438 x.mapv(|xi| xi.tanh())
1439 }
1440}
1441
1442impl<T: Float + Debug + Send + Sync + 'static + Default + Clone> HistoryBuffer<T> {
1443 fn new(_maxlength: usize) -> Self {
1445 Self {
1446 gradients: VecDeque::with_capacity(_maxlength),
1447 parameters: VecDeque::with_capacity(_maxlength),
1448 losses: VecDeque::with_capacity(_maxlength),
1449 learning_rates: VecDeque::with_capacity(_maxlength),
1450 update_magnitudes: VecDeque::with_capacity(_maxlength),
1451 _maxlength,
1452 feature_cache: None,
1453 }
1454 }
1455
1456 fn update(&mut self, params: &Array1<T>, grads: &Array1<T>, loss: Option<T>) {
1458 self.parameters.push_back(params.clone());
1460 self.gradients.push_back(grads.clone());
1461
1462 if let Some(l) = loss {
1463 self.losses.push_back(l);
1464 }
1465
1466 while self.parameters.len() > self._maxlength {
1468 self.parameters.pop_front();
1469 }
1470 while self.gradients.len() > self._maxlength {
1471 self.gradients.pop_front();
1472 }
1473 while self.losses.len() > self._maxlength {
1474 self.losses.pop_front();
1475 }
1476
1477 self.feature_cache = None;
1479 }
1480
1481 fn get_recent_gradients(&self, count: usize) -> Option<Vec<&Array1<T>>> {
1483 if self.gradients.len() < count {
1484 return None;
1485 }
1486
1487 Some(self.gradients.iter().rev().take(count).collect())
1488 }
1489
1490 fn get_loss_features(&self) -> Option<Vec<T>> {
1492 if self.losses.len() < 2 {
1493 return None;
1494 }
1495
1496 let current_loss = *self.losses.back().unwrap();
1497 let prev_loss = self.losses[self.losses.len() - 2];
1498
1499 let loss_change = current_loss - prev_loss;
1500 let loss_ratio = if prev_loss.abs()
1501 > scirs2_core::numeric::NumCast::from(1e-8).unwrap_or_else(|| T::zero())
1502 {
1503 current_loss / prev_loss
1504 } else {
1505 T::one()
1506 };
1507
1508 Some(vec![loss_change, loss_ratio])
1509 }
1510}
1511
/// Snapshot of the state tracker's analysis, returned by
/// `LSTMOptimizer::get_state_analysis`.
#[derive(Debug, Clone)]
pub struct OptimizationStateAnalysis<T: Float + Debug + Send + Sync + 'static> {
    /// Current optimization phase.
    pub current_phase: OptimizationPhase,
    /// Convergence signals at snapshot time.
    pub convergence_indicators: ConvergenceIndicators<T>,
    /// Gradient analysis at snapshot time.
    pub gradient_analysis: GradientAnalyzer<T>,
    /// Loss-landscape analysis at snapshot time.
    pub landscape_analysis: LossLandscapeAnalyzer<T>,
    /// Stability metrics at snapshot time.
    pub stability_metrics: StabilityMetrics<T>,
}
1522
/// Outcome of adapting the optimizer to a new task domain.
#[derive(Debug, Clone)]
pub struct TransferResults<T: Float + Debug + Send + Sync + 'static> {
    /// Performance before adaptation.
    pub initial_performance: T,
    /// Performance after adaptation.
    pub final_performance: T,
    /// Number of adaptation steps taken.
    pub adaptation_steps: usize,
    /// Efficiency of the transfer.
    pub transfer_efficiency: T,
}
1531
1532impl Default for LSTMOptimizerMetrics {
1535 fn default() -> Self {
1536 Self::new()
1537 }
1538}
1539
1540impl LSTMOptimizerMetrics {
1541 fn new() -> Self {
1542 Self {
1543 meta_learning_loss: 0.0,
1544 avg_convergence_speed: 0.0,
1545 generalization_performance: 0.0,
1546 adaptation_efficiency: 0.0,
1547 transfer_success_rate: 0.0,
1548 memory_usage_mb: 0.0,
1549 computational_overhead: 1.0,
1550 lstm_stats: LSTMNetworkStats {
1551 gate_activations: GateActivationStats {
1552 input_gate: StateStatistics::default(),
1553 forget_gate: StateStatistics::default(),
1554 output_gate: StateStatistics::default(),
1555 cell_gate: StateStatistics::default(),
1556 },
1557 hidden_state_stats: StateStatistics::default(),
1558 cell_state_stats: StateStatistics::default(),
1559 gradient_flow_stats: GradientFlowStats {
1560 layer_gradient_norms: Vec::new(),
1561 layer_correlations: Vec::new(),
1562 vanishing_gradient_score: 0.0,
1563 exploding_gradient_score: 0.0,
1564 },
1565 },
1566 attention_stats: None,
1567 }
1568 }
1569}
1570
1571impl Default for StateStatistics {
1572 fn default() -> Self {
1573 Self {
1574 mean: 0.0,
1575 std: 0.0,
1576 min: 0.0,
1577 max: 0.0,
1578 saturation_percent: 0.0,
1579 }
1580 }
1581}
1582
1583impl<T: Float + Debug + Send + Sync + 'static + Default + Clone> MetaLearner<T> {
1587 fn new(config: &LearnedOptimizerConfig) -> Result<Self> {
1588 Ok(Self {
1590 strategy: MetaOptimizationStrategy::MAML,
1591 meta_parameters: HashMap::new(),
1592 meta_gradients: HashMap::new(),
1593 task_history: VecDeque::new(),
1594 meta_state: MetaLearningState {
1595 meta_step: 0,
1596 meta_lr: scirs2_core::numeric::NumCast::from(0.001).unwrap_or_else(|| T::zero()),
1597 adaptation_rate: scirs2_core::numeric::NumCast::from(0.1)
1598 .unwrap_or_else(|| T::zero()),
1599 meta_validation_performance: T::zero(),
1600 adaptation_history: VecDeque::new(),
1601 inner_loop_state: InnerLoopState {
1602 inner_step: 0,
1603 inner_parameters: Array1::zeros(1),
1604 inner_optimizer_state: HashMap::new(),
1605 inner_performance: T::zero(),
1606 },
1607 },
1608 transfer_learner: TransferLearner {
1609 source_knowledge: HashMap::new(),
1610 adaptation_parameters: Array1::zeros(1),
1611 transfer_metrics: TransferMetrics {
1612 efficiency: T::zero(),
1613 adaptation_speed: T::zero(),
1614 knowledge_retention: T::zero(),
1615 negative_transfer_score: T::zero(),
1616 },
1617 similarity_estimator: DomainSimilarityEstimator {
1618 domain_embeddings: HashMap::new(),
1619 similarity_params: Array1::zeros(1),
1620 similarity_function: SimilarityFunction::Cosine,
1621 },
1622 },
1623 })
1624 }
1625
1626 fn step(&mut self, tasks: &[MetaTask<T>], network: &mut LSTMNetwork<T>) -> Result<T> {
1627 Ok(T::zero())
1629 }
1630}
1631
1632impl<T: Float + Debug + Send + Sync + 'static + Default + Clone> TransferLearner<T> {
1633 fn transfer_to_domain(
1634 &mut self,
1635 _target_tasks: &[MetaTask<T>],
1636 _network: &mut LSTMNetwork<T>,
1637 ) -> Result<TransferResults<T>> {
1638 Ok(TransferResults {
1640 initial_performance: T::zero(),
1641 final_performance: T::zero(),
1642 adaptation_steps: 0,
1643 transfer_efficiency: T::zero(),
1644 })
1645 }
1646}
1647
1648impl<T: Float + Debug + Send + Sync + 'static + Default + Clone> AdaptiveLearningRateController<T> {
1649 fn new(config: &LearnedOptimizerConfig) -> Result<Self> {
1650 Ok(Self {
1652 base_lr: scirs2_core::numeric::NumCast::from(0.001).unwrap_or_else(|| T::zero()),
1653 current_lr: scirs2_core::numeric::NumCast::from(0.001).unwrap_or_else(|| T::zero()),
1654 adaptation_params: LRAdaptationParams {
1655 momentum: scirs2_core::numeric::NumCast::from(0.9).unwrap_or_else(|| T::zero()),
1656 gradient_sensitivity: scirs2_core::numeric::NumCast::from(0.1)
1657 .unwrap_or_else(|| T::zero()),
1658 loss_sensitivity: scirs2_core::numeric::NumCast::from(0.1)
1659 .unwrap_or_else(|| T::zero()),
1660 min_lr: scirs2_core::numeric::NumCast::from(1e-6).unwrap_or_else(|| T::zero()),
1661 max_lr: scirs2_core::numeric::NumCast::from(0.1).unwrap_or_else(|| T::zero()),
1662 adaptation_rate: scirs2_core::numeric::NumCast::from(0.01)
1663 .unwrap_or_else(|| T::zero()),
1664 },
1665 lr_history: VecDeque::new(),
1666 performance_tracker: PerformanceTracker {
1667 recent_losses: VecDeque::new(),
1668 trend: PerformanceTrend::Unknown,
1669 stagnation_counter: 0,
1670 best_performance: T::zero(),
1671 improvement_rate: T::zero(),
1672 },
1673 schedule_params: None,
1674 })
1675 }
1676
1677 fn compute_lr(
1678 &mut self,
1679 gradients: &Array1<T>,
1680 _loss: Option<T>,
1681 _history: &HistoryBuffer<T>,
1682 ) -> Result<T> {
1683 Ok(self.current_lr)
1685 }
1686}
1687
1688impl<T: Float + Debug + Send + Sync + 'static + Default + Clone> OptimizationStateTracker<T> {
1689 fn new() -> Self {
1690 Self {
1691 phase: OptimizationPhase::InitialDescent,
1692 convergence_indicators: ConvergenceIndicators {
1693 gradient_norm_trend: Vec::new(),
1694 loss_change_trend: Vec::new(),
1695 parameter_change_magnitude: T::zero(),
1696 convergence_probability: T::zero(),
1697 estimated_steps_to_convergence: None,
1698 },
1699 gradient_analyzer: GradientAnalyzer {
1700 gradient_stats: GradientStatistics {
1701 mean_norm: T::zero(),
1702 norm_variance: T::zero(),
1703 direction_consistency: T::zero(),
1704 magnitude_distribution: Vec::new(),
1705 component_stats: Array1::zeros(1),
1706 },
1707 correlation_tracker: GradientCorrelationTracker {
1708 correlation_matrix: Array2::zeros((1, 1)),
1709 temporal_correlations: VecDeque::new(),
1710 cross_correlations: HashMap::new(),
1711 },
1712 noise_estimator: GradientNoiseEstimator {
1713 noise_level: T::zero(),
1714 signal_to_noise_ratio: T::zero(),
1715 noise_characteristics: NoiseCharacteristics {
1716 noise_type: NoiseType::White,
1717 scale: T::zero(),
1718 temporal_correlation: T::zero(),
1719 spatial_correlation: T::zero(),
1720 },
1721 },
1722 flow_analyzer: GradientFlowAnalyzer {
1723 flow_field: Array2::zeros((1, 1)),
1724 critical_points: Vec::new(),
1725 stability: FlowStability::Unknown,
1726 attractors: Vec::new(),
1727 repellers: Vec::new(),
1728 },
1729 },
1730 landscape_analyzer: LossLandscapeAnalyzer {
1731 local_curvature: T::zero(),
1732 hessian_eigenvalues: None,
1733 roughness: T::zero(),
1734 basin_size: T::zero(),
1735 barrier_heights: Vec::new(),
1736 },
1737 stability_metrics: StabilityMetrics {
1738 lyapunov_exponents: Array1::zeros(1),
1739 stability_margin: T::zero(),
1740 perturbation_sensitivity: T::zero(),
1741 robustness_score: T::zero(),
1742 },
1743 }
1744 }
1745
1746 fn update(&mut self, gradients: &Array1<T>, _updates: &Array1<T>, loss: Option<T>) {
1747 }
1749}
1750
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_lstm_optimizer_creation() {
        let config = LearnedOptimizerConfig::default();
        let optimizer = LSTMOptimizer::<f64>::new(config);
        assert!(optimizer.is_ok());
    }

    #[test]
    fn test_lstm_layer_creation() {
        let layer = LSTMLayer::<f64>::new(10, 20);
        assert!(layer.is_ok());

        let layer = layer.unwrap();
        assert_eq!(layer.hiddensize, 20);
        // The four LSTM gates are stacked, so weight rows = 4 * hidden_size.
        assert_eq!(layer.weight_ih.shape(), &[80, 10]);
        assert_eq!(layer.weight_hh.shape(), &[80, 20]);
    }

    #[test]
    fn test_history_buffer() {
        let mut buffer = HistoryBuffer::<f64>::new(5);

        let params = Array1::from_vec(vec![1.0, 2.0, 3.0]);
        let grads = Array1::from_vec(vec![0.1, 0.2, 0.3]);

        // NOTE(review): original source had a mangled `¶ms` token here;
        // restored to `&params`.
        buffer.update(&params, &grads, Some(0.5));

        assert_eq!(buffer.gradients.len(), 1);
        assert_eq!(buffer.parameters.len(), 1);
        assert_eq!(buffer.losses.len(), 1);
    }

    #[test]
    fn test_config_validation() {
        let mut config = LearnedOptimizerConfig::default();
        assert!(LSTMOptimizer::<f64>::validate_config(&config).is_ok());

        // A zero hidden size is invalid and must be rejected.
        config.hidden_size = 0;
        assert!(LSTMOptimizer::<f64>::validate_config(&config).is_err());
    }

    #[test]
    fn test_lstm_network_creation() {
        let config = LearnedOptimizerConfig::default();
        let network = LSTMNetwork::<f64>::new(&config);
        assert!(network.is_ok());

        let network = network.unwrap();
        assert_eq!(network.layers.len(), config.num_layers);
        assert!(network.attention.is_some());
    }

    #[test]
    fn test_metrics_initialization() {
        let metrics = LSTMOptimizerMetrics::new();
        assert_eq!(metrics.meta_learning_loss, 0.0);
        assert_eq!(metrics.computational_overhead, 1.0);
        assert!(metrics.attention_stats.is_none());
    }
}