1#[allow(dead_code)]
8use scirs2_core::ndarray::{s, Array, Array1, Array2, ArrayBase, Data, Dimension};
9use scirs2_core::numeric::Float;
10use scirs2_core::random::Rng;
11use std::collections::{HashMap, VecDeque};
12use std::fmt::Debug;
13
14use super::{LearnedOptimizerConfig, MetaOptimizationStrategy};
15use crate::error::{OptimError, Result};
16
/// LSTM-based learned optimizer: a recurrent network predicts parameter
/// updates from observed gradient/loss history instead of a hand-designed
/// update rule (learning-to-learn).
#[derive(Debug)]
pub struct LSTMOptimizer<T: Float + Debug + Send + Sync + 'static> {
    /// Static hyperparameters for the learned optimizer.
    config: LearnedOptimizerConfig,

    /// Recurrent network that maps gradient features to parameter updates.
    lstm_network: LSTMNetwork<T>,

    /// Rolling buffer of past gradients, parameters and losses used to
    /// build the LSTM input features.
    history_buffer: HistoryBuffer<T>,

    /// Outer-loop meta-learning machinery (task history, transfer learning).
    meta_learner: MetaLearner<T>,

    /// Produces the per-step scalar learning rate from recent progress.
    lr_controller: AdaptiveLearningRateController<T>,

    /// Tracks optimization phase, convergence and stability diagnostics.
    state_tracker: OptimizationStateTracker<T>,

    /// Aggregated runtime metrics (losses, overhead, LSTM statistics).
    metrics: LSTMOptimizerMetrics,

    /// Number of `lstm_step` calls performed so far.
    step_count: usize,

    /// Random source (currently unused by the visible step path — TODO confirm).
    rng: scirs2_core::random::CoreRandom,
}
47
/// Stacked LSTM with optional attention and a final linear projection.
#[derive(Debug, Clone)]
pub struct LSTMNetwork<T: Float + Debug + Send + Sync + 'static> {
    /// Recurrent layers applied in order; layer 0 consumes the input features.
    layers: Vec<LSTMLayer<T>>,

    /// Final projection from hidden state to the update space.
    output_projection: OutputProjection<T>,

    /// Optional multi-head attention applied after the recurrent stack.
    attention: Option<AttentionMechanism<T>>,

    /// One layer-normalization per LSTM layer (applied after each layer).
    layer_norms: Vec<LayerNormalization<T>>,

    /// Dropout probability in [0, 1); 0 disables dropout.
    dropout_rate: f64,
}

/// One LSTM cell holding its weights and the carried recurrent state.
/// Gate weights are packed row-wise as [input, forget, cell, output],
/// each block `hiddensize` rows tall (see `LSTMLayer::forward` slicing).
#[derive(Debug, Clone)]
pub struct LSTMLayer<T: Float + Debug + Send + Sync + 'static> {
    /// Input-to-hidden weights, shape (4 * hiddensize, input_size).
    weight_ih: Array2<T>,

    /// Hidden-to-hidden (recurrent) weights, shape (4 * hiddensize, hiddensize).
    weight_hh: Array2<T>,

    /// Input-to-hidden bias, length 4 * hiddensize.
    bias_ih: Array1<T>,

    /// Hidden-to-hidden bias, length 4 * hiddensize.
    bias_hh: Array1<T>,

    /// Hidden state h_t carried between forward calls.
    hidden_state: Array1<T>,

    /// Cell state c_t carried between forward calls.
    cell_state: Array1<T>,

    /// Width of the hidden/cell state.
    hiddensize: usize,
}

/// Linear output head (`weights * x + bias`) plus a nonlinearity selector.
/// NOTE(review): the current `forward` is a pass-through stub and does not
/// yet use `weights`/`bias` — see `OutputProjection::forward`.
#[derive(Debug, Clone)]
pub struct OutputProjection<T: Float + Debug + Send + Sync + 'static> {
    /// Projection matrix, shape (output_size, input_size); zero-initialized.
    weights: Array2<T>,

    /// Output bias, length output_size; zero-initialized.
    bias: Array1<T>,

    /// Which nonlinearity `generate_updates` applies to the LSTM output.
    output_transform: OutputTransform,
}
104
105impl<T: Float + Debug + Send + Sync + 'static + Default + Clone> OutputProjection<T> {
106 pub fn new(
108 input_size: usize,
109 output_size: usize,
110 output_transform: OutputTransform,
111 ) -> Result<Self> {
112 let weights = Array2::zeros((output_size, input_size));
113 let bias = Array1::zeros(output_size);
114
115 Ok(Self {
116 weights,
117 bias,
118 output_transform,
119 })
120 }
121
122 pub fn forward(&self, input: &Array1<T>) -> Result<Array1<T>> {
124 Ok(input.clone())
126 }
127}
128
/// Nonlinearity applied to the raw LSTM output before scaling by the
/// learning rate (dispatched in `LSTMOptimizer::generate_updates`).
#[derive(Debug, Clone, Copy)]
pub enum OutputTransform {
    /// No transformation.
    Identity,

    /// Elementwise tanh, bounding updates to (-1, 1).
    Tanh,

    /// tanh scaled by a fixed factor, bounding updates to (-scale, scale).
    ScaledTanh { scale: f64 },

    /// Scale shrinks as the gradient norm grows: 1 / (1 + ||g||).
    AdaptiveScale,

    /// Learned nonlinearity (currently evaluates tanh via exponentials).
    LearnedNonlinear,
}

/// Multi-head self-attention block over the hidden representation.
/// NOTE(review): `forward` is currently a pass-through stub; the projection
/// matrices below are zero-initialized and unused so far.
#[derive(Debug, Clone)]
pub struct AttentionMechanism<T: Float + Debug + Send + Sync + 'static> {
    /// Query projection, shape (hidden, hidden).
    query_proj: Array2<T>,

    /// Key projection, shape (hidden, hidden).
    key_proj: Array2<T>,

    /// Value projection, shape (hidden, hidden).
    value_proj: Array2<T>,

    /// Output projection, shape (hidden, hidden).
    output_proj: Array2<T>,

    /// Number of attention heads.
    num_heads: usize,

    /// Per-head width: hidden_size / num_heads.
    head_size: usize,

    /// Last computed attention weights, if any (used for metrics).
    attentionweights: Option<Array2<T>>,
}
172
173impl<T: Float + Debug + Send + Sync + 'static + Default + Clone> AttentionMechanism<T> {
174 pub fn new(config: &LearnedOptimizerConfig) -> Result<Self> {
176 let hiddensize = config.hidden_size;
177 let num_heads = config.attention_heads;
178 let head_size = hiddensize / num_heads;
179
180 Ok(Self {
181 query_proj: Array2::zeros((hiddensize, hiddensize)),
182 key_proj: Array2::zeros((hiddensize, hiddensize)),
183 value_proj: Array2::zeros((hiddensize, hiddensize)),
184 output_proj: Array2::zeros((hiddensize, hiddensize)),
185 num_heads,
186 head_size,
187 attentionweights: None,
188 })
189 }
190
191 pub fn forward(&mut self, input: &Array1<T>) -> Result<Array1<T>> {
193 Ok(input.clone())
195 }
196}
197
/// Layer normalization parameters: affine scale/shift plus a stability term.
#[derive(Debug, Clone)]
pub struct LayerNormalization<T: Float + Debug + Send + Sync + 'static> {
    /// Per-feature scale; initialized to ones (identity).
    gamma: Array1<T>,

    /// Per-feature shift; initialized to zeros (identity).
    beta: Array1<T>,

    /// Small constant added to the variance to avoid division by zero.
    epsilon: T,
}
210
211impl<T: Float + Debug + Send + Sync + 'static + Default + Clone> LayerNormalization<T> {
212 pub fn new(features: usize) -> Result<Self> {
214 Ok(Self {
215 gamma: Array1::ones(features),
216 beta: Array1::zeros(features),
217 epsilon: scirs2_core::numeric::NumCast::from(1e-5).unwrap_or_else(|| T::zero()),
218 })
219 }
220
221 pub fn forward(&self, input: &Array1<T>) -> Result<Array1<T>> {
223 Ok(input.clone())
225 }
226}
227
/// Bounded rolling history of the optimization trajectory; feeds feature
/// construction in `LSTMOptimizer::prepare_lstm_input`.
#[derive(Debug, Clone)]
pub struct HistoryBuffer<T: Float + Debug + Send + Sync + 'static> {
    /// Most recent flattened gradients (newest at the back).
    gradients: VecDeque<Array1<T>>,

    /// Most recent flattened parameter vectors.
    parameters: VecDeque<Array1<T>>,

    /// Most recent loss values (only pushed when a loss was supplied).
    losses: VecDeque<T>,

    /// Learning rates used per step.
    learning_rates: VecDeque<T>,

    /// Norms of applied updates per step.
    update_magnitudes: VecDeque<T>,

    /// Maximum retained entries per deque.
    _maxlength: usize,

    /// Cached derived features; invalidated on every `update`.
    feature_cache: Option<Array2<T>>,
}

/// Outer-loop meta-learner: owns meta-parameters, task history and the
/// transfer-learning machinery.
#[derive(Debug, Clone)]
pub struct MetaLearner<T: Float + Debug + Send + Sync + 'static> {
    /// Which meta-optimization algorithm is used (e.g. MAML).
    strategy: MetaOptimizationStrategy,

    /// Named meta-parameter vectors.
    meta_parameters: HashMap<String, Array1<T>>,

    /// Gradients of the meta-objective w.r.t. each meta-parameter.
    meta_gradients: HashMap<String, Array1<T>>,

    /// Recently seen meta-training tasks.
    task_history: VecDeque<MetaTask<T>>,

    /// Current outer/inner loop state.
    meta_state: MetaLearningState<T>,

    /// Cross-domain transfer component.
    transfer_learner: TransferLearner<T>,
}

/// A single meta-training task: its trajectory, outcome and descriptors.
#[derive(Debug, Clone)]
pub struct MetaTask<T: Float + Debug + Send + Sync + 'static> {
    /// Unique task identifier.
    pub id: String,

    /// Learning-problem category of the task.
    pub task_type: TaskType,

    /// Recorded optimization trajectory on this task.
    pub training_trajectory: Vec<TrajectoryPoint<T>>,

    /// Performance achieved at the end of training.
    pub final_performance: T,

    /// Descriptors of the task's loss surface and domain.
    pub characteristics: TaskCharacteristics<T>,

    /// Weight of this task in the meta-objective.
    pub weight: T,
}

/// Category of learning problem a meta-task represents.
#[derive(Debug, Clone, Copy)]
pub enum TaskType {
    SupervisedLearning,

    ReinforcementLearning,

    UnsupervisedLearning,

    FewShotLearning,

    OnlineLearning,

    AdversarialTraining,
}

/// One recorded step of an optimization trajectory.
#[derive(Debug, Clone)]
pub struct TrajectoryPoint<T: Float + Debug + Send + Sync + 'static> {
    /// Step index within the trajectory.
    pub step: usize,

    /// Gradient observed at this step.
    pub gradient: Array1<T>,

    /// Parameter vector at this step.
    pub parameters: Array1<T>,

    /// Loss value at this step.
    pub loss: T,

    /// Learning rate used at this step.
    pub learning_rate: T,

    /// Update that was applied.
    pub update: Array1<T>,
}

/// Scalar/vector descriptors summarizing a task's optimization landscape.
#[derive(Debug, Clone)]
pub struct TaskCharacteristics<T: Float + Debug + Send + Sync + 'static> {
    /// Number of optimized parameters.
    pub dimensionality: usize,

    /// Estimated local curvature of the loss surface.
    pub curvature: T,

    /// Estimated gradient-noise level.
    pub noise_level: T,

    /// Conditioning estimate (e.g. condition-number proxy — TODO confirm).
    pub conditioning: T,

    /// Overall difficulty score.
    pub difficulty: T,

    /// Embedding of the task's domain.
    pub domain_features: Array1<T>,
}
362
/// Mutable state of the outer meta-learning loop.
#[derive(Debug, Clone)]
pub struct MetaLearningState<T: Float + Debug + Send + Sync + 'static> {
    /// Number of completed meta-steps.
    pub meta_step: usize,

    /// Outer-loop learning rate.
    pub meta_lr: T,

    /// Rate used for per-task (inner-loop) adaptation.
    pub adaptation_rate: T,

    /// Latest meta-validation score.
    pub meta_validation_performance: T,

    /// Log of past cross-task adaptation events.
    pub adaptation_history: VecDeque<AdaptationEvent<T>>,

    /// State of the current inner (task-level) loop.
    pub inner_loop_state: InnerLoopState<T>,
}

/// Record of one source→target task adaptation.
#[derive(Debug, Clone)]
pub struct AdaptationEvent<T: Float + Debug + Send + Sync + 'static> {
    /// Task the knowledge was transferred from.
    pub source_task: String,

    /// Task the knowledge was transferred to.
    pub target_task: String,

    /// Inner-loop steps spent adapting.
    pub adaptation_steps: usize,

    /// How efficiently the transfer worked.
    pub transfer_efficiency: T,

    /// Performance gain attributable to the transfer.
    pub performance_improvement: T,
}

/// Inner (per-task) optimization loop snapshot.
#[derive(Debug, Clone)]
pub struct InnerLoopState<T: Float + Debug + Send + Sync + 'static> {
    /// Current inner-loop step index.
    pub inner_step: usize,

    /// Task-adapted parameter vector.
    pub inner_parameters: Array1<T>,

    /// Named state of the inner optimizer (e.g. moment estimates).
    pub inner_optimizer_state: HashMap<String, Array1<T>>,

    /// Current inner-loop performance.
    pub inner_performance: T,
}

/// Cross-domain transfer-learning component.
#[derive(Debug, Clone)]
pub struct TransferLearner<T: Float + Debug + Send + Sync + 'static> {
    /// Knowledge vectors harvested from source domains, keyed by name.
    pub source_knowledge: HashMap<String, Array1<T>>,

    /// Parameters governing the adaptation procedure.
    pub adaptation_parameters: Array1<T>,

    /// Quality metrics of past transfers.
    pub transfer_metrics: TransferMetrics<T>,

    /// Estimates similarity between source and target domains.
    pub similarity_estimator: DomainSimilarityEstimator<T>,
}

/// Aggregate quality metrics for knowledge transfer.
#[derive(Debug, Clone)]
pub struct TransferMetrics<T: Float + Debug + Send + Sync + 'static> {
    /// Overall transfer efficiency.
    pub efficiency: T,

    /// How quickly the target task adapted.
    pub adaptation_speed: T,

    /// How much source knowledge was retained.
    pub knowledge_retention: T,

    /// Degree of harmful (negative) transfer observed.
    pub negative_transfer_score: T,
}

/// Estimates similarity between task domains from learned embeddings.
#[derive(Debug, Clone)]
pub struct DomainSimilarityEstimator<T: Float + Debug + Send + Sync + 'static> {
    /// Embedding vector per known domain.
    pub domain_embeddings: HashMap<String, Array1<T>>,

    /// Parameters of the similarity function (when learned).
    pub similarity_params: Array1<T>,

    /// Which similarity measure is applied.
    pub similarity_function: SimilarityFunction,
}

/// Similarity measure used to compare domain embeddings.
#[derive(Debug, Clone, Copy)]
pub enum SimilarityFunction {
    /// Cosine similarity.
    Cosine,

    /// (Negative) Euclidean distance.
    Euclidean,

    /// Parameterized, learned metric.
    LearnedMetric,

    /// Metric specialized per task type.
    TaskSpecific,
}
480
/// Adapts the scalar learning rate from gradient/loss feedback each step.
#[derive(Debug, Clone)]
pub struct AdaptiveLearningRateController<T: Float + Debug + Send + Sync + 'static> {
    /// Reference (initial) learning rate.
    base_lr: T,

    /// Learning rate currently in effect.
    current_lr: T,

    /// Knobs governing how the rate reacts to feedback.
    adaptation_params: LRAdaptationParams<T>,

    /// Recently used learning rates.
    lr_history: VecDeque<T>,

    /// Loss-trend tracker driving the adaptation decisions.
    performance_tracker: PerformanceTracker<T>,

    /// Optional learned schedule parameters.
    schedule_params: Option<Array1<T>>,
}

/// Tuning knobs for learning-rate adaptation.
#[derive(Debug, Clone)]
pub struct LRAdaptationParams<T: Float + Debug + Send + Sync + 'static> {
    /// Smoothing applied to rate changes.
    pub momentum: T,

    /// How strongly gradient magnitudes influence the rate.
    pub gradient_sensitivity: T,

    /// How strongly loss changes influence the rate.
    pub loss_sensitivity: T,

    /// Lower clamp for the learning rate.
    pub min_lr: T,

    /// Upper clamp for the learning rate.
    pub max_lr: T,

    /// Speed at which the rate is adjusted.
    pub adaptation_rate: T,
}

/// Tracks recent loss behavior to classify training progress.
#[derive(Debug, Clone)]
pub struct PerformanceTracker<T: Float + Debug + Send + Sync + 'static> {
    /// Sliding window of recent losses.
    recent_losses: VecDeque<T>,

    /// Classified direction of progress.
    trend: PerformanceTrend,

    /// Consecutive steps without improvement.
    stagnation_counter: usize,

    /// Best loss observed so far.
    best_performance: T,

    /// Estimated rate of improvement.
    improvement_rate: T,
}

/// Qualitative classification of recent training progress.
#[derive(Debug, Clone, Copy)]
pub enum PerformanceTrend {
    /// Loss is decreasing.
    Improving,

    /// Loss is flat.
    Stagnating,

    /// Loss is increasing.
    Degrading,

    /// Loss alternates up and down.
    Oscillating,

    /// Not enough data to classify.
    Unknown,
}
562
/// Diagnoses the state of the optimization run: phase, convergence,
/// gradient behavior, landscape shape and stability.
#[derive(Debug, Clone)]
pub struct OptimizationStateTracker<T: Float + Debug + Send + Sync + 'static> {
    /// Current qualitative phase of the run.
    phase: OptimizationPhase,

    /// Signals used to judge convergence.
    convergence_indicators: ConvergenceIndicators<T>,

    /// Gradient statistics / correlation / noise analysis.
    gradient_analyzer: GradientAnalyzer<T>,

    /// Local loss-landscape estimates.
    landscape_analyzer: LossLandscapeAnalyzer<T>,

    /// Stability/robustness estimates.
    stability_metrics: StabilityMetrics<T>,
}

/// Qualitative phase of an optimization run.
#[derive(Debug, Clone, Copy)]
pub enum OptimizationPhase {
    /// Early rapid loss reduction.
    InitialDescent,

    /// Consistent, steady improvement.
    SteadyProgress,

    /// Small refinements near an optimum.
    FineTuning,

    /// Run has converged.
    Converged,

    /// Progress has flattened without converging.
    Plateau,

    /// Loss is increasing.
    Diverging,
}

/// Signals from which convergence is estimated.
#[derive(Debug, Clone)]
pub struct ConvergenceIndicators<T: Float + Debug + Send + Sync + 'static> {
    /// Recent history of gradient norms.
    pub gradient_norm_trend: Vec<T>,

    /// Recent history of loss deltas.
    pub loss_change_trend: Vec<T>,

    /// Magnitude of the last parameter change.
    pub parameter_change_magnitude: T,

    /// Estimated probability that the run has converged.
    pub convergence_probability: T,

    /// Predicted remaining steps to convergence, when estimable.
    pub estimated_steps_to_convergence: Option<usize>,
}

/// Bundles all gradient-analysis sub-components.
#[derive(Debug, Clone)]
pub struct GradientAnalyzer<T: Float + Debug + Send + Sync + 'static> {
    /// Summary statistics of observed gradients.
    pub gradient_stats: GradientStatistics<T>,

    /// Temporal/cross correlations between gradients.
    pub correlation_tracker: GradientCorrelationTracker<T>,

    /// Gradient-noise estimation.
    pub noise_estimator: GradientNoiseEstimator<T>,

    /// Gradient flow-field analysis.
    pub flow_analyzer: GradientFlowAnalyzer<T>,
}

/// Summary statistics of the gradient stream.
#[derive(Debug, Clone)]
pub struct GradientStatistics<T: Float + Debug + Send + Sync + 'static> {
    /// Mean gradient norm.
    pub mean_norm: T,

    /// Variance of the gradient norm.
    pub norm_variance: T,

    /// How consistent successive gradient directions are.
    pub direction_consistency: T,

    /// Histogram/samples of gradient magnitudes.
    pub magnitude_distribution: Vec<T>,

    /// Per-component statistics.
    pub component_stats: Array1<T>,
}

/// Tracks correlations within and across gradient sequences.
#[derive(Debug, Clone)]
pub struct GradientCorrelationTracker<T: Float + Debug + Send + Sync + 'static> {
    /// Pairwise correlation matrix.
    pub correlation_matrix: Array2<T>,

    /// Correlation between successive gradients over time.
    pub temporal_correlations: VecDeque<T>,

    /// Named cross-correlations with other signals.
    pub cross_correlations: HashMap<String, T>,
}

/// Estimates the stochastic noise in observed gradients.
#[derive(Debug, Clone)]
pub struct GradientNoiseEstimator<T: Float + Debug + Send + Sync + 'static> {
    /// Estimated noise magnitude.
    pub noise_level: T,

    /// Ratio of gradient signal to noise.
    pub signal_to_noise_ratio: T,

    /// Structural description of the noise.
    pub noise_characteristics: NoiseCharacteristics<T>,
}

/// Structural description of gradient noise.
#[derive(Debug, Clone)]
pub struct NoiseCharacteristics<T: Float + Debug + Send + Sync + 'static> {
    /// Classified noise family.
    pub noise_type: NoiseType,

    /// Noise scale.
    pub scale: T,

    /// Correlation of the noise over time.
    pub temporal_correlation: T,

    /// Correlation of the noise across components.
    pub spatial_correlation: T,
}

/// Families of gradient noise.
#[derive(Debug, Clone, Copy)]
pub enum NoiseType {
    /// Uncorrelated (white) noise.
    White,

    /// Temporally correlated (colored) noise.
    Colored,

    /// Noise with systematic structure.
    Structured,

    /// Noise whose characteristics change over time.
    Adaptive,
}

/// Analysis of the gradient vector field around the current iterate.
#[derive(Debug, Clone)]
pub struct GradientFlowAnalyzer<T: Float + Debug + Send + Sync + 'static> {
    /// Sampled flow field.
    pub flow_field: Array2<T>,

    /// Detected critical points of the field.
    pub critical_points: Vec<Array1<T>>,

    /// Stability classification of the flow.
    pub stability: FlowStability,

    /// Attracting fixed points.
    pub attractors: Vec<Array1<T>>,
    /// Repelling fixed points.
    pub repellers: Vec<Array1<T>>,
}

/// Stability classification of the gradient flow.
#[derive(Debug, Clone, Copy)]
pub enum FlowStability {
    /// Flow converges toward an attractor.
    Stable,

    /// Flow diverges.
    Unstable,

    /// Flow exhibits chaotic behavior.
    Chaotic,

    /// Not enough information to classify.
    Unknown,
}
748
/// Local estimates of the loss-surface geometry around the current iterate.
#[derive(Debug, Clone)]
pub struct LossLandscapeAnalyzer<T: Float + Debug + Send + Sync + 'static> {
    /// Estimated local curvature.
    pub local_curvature: T,

    /// Hessian eigenvalue estimates, when available.
    pub hessian_eigenvalues: Option<Array1<T>>,

    /// Roughness/ruggedness of the surface.
    pub roughness: T,

    /// Estimated size of the current basin of attraction.
    pub basin_size: T,

    /// Estimated heights of barriers to neighboring basins.
    pub barrier_heights: Vec<T>,
}

/// Stability/robustness estimates of the optimization dynamics.
#[derive(Debug, Clone)]
pub struct StabilityMetrics<T: Float + Debug + Send + Sync + 'static> {
    /// Estimated Lyapunov exponents of the dynamics.
    pub lyapunov_exponents: Array1<T>,

    /// Margin before the dynamics become unstable.
    pub stability_margin: T,

    /// Sensitivity of the trajectory to perturbations.
    pub perturbation_sensitivity: T,

    /// Aggregate robustness score.
    pub robustness_score: T,
}

/// Runtime metrics reported by the LSTM optimizer (plain f64, no generics,
/// so they can be exported without the numeric type parameter).
#[derive(Debug, Clone)]
pub struct LSTMOptimizerMetrics {
    /// Latest meta-learning objective value.
    pub meta_learning_loss: f64,

    /// Average speed of convergence across tasks.
    pub avg_convergence_speed: f64,

    /// Performance on held-out tasks.
    pub generalization_performance: f64,

    /// Ratio of update norm to gradient norm (see `update_metrics`).
    pub adaptation_efficiency: f64,

    /// Fraction of successful domain transfers.
    pub transfer_success_rate: f64,

    /// Estimated memory footprint in MiB.
    pub memory_usage_mb: f64,

    /// Estimated cost multiplier relative to a plain optimizer step.
    pub computational_overhead: f64,

    /// Statistics of the LSTM network internals.
    pub lstm_stats: LSTMNetworkStats,

    /// Attention statistics, when attention is enabled and has run.
    pub attention_stats: Option<AttentionStats>,
}

/// Statistics of LSTM internals (gates, states, gradient flow).
#[derive(Debug, Clone)]
pub struct LSTMNetworkStats {
    /// Activation statistics per gate.
    pub gate_activations: GateActivationStats,

    /// Statistics of the hidden state.
    pub hidden_state_stats: StateStatistics,

    /// Statistics of the cell state.
    pub cell_state_stats: StateStatistics,

    /// Gradient-flow health indicators across layers.
    pub gradient_flow_stats: GradientFlowStats,
}

/// Per-gate activation statistics.
#[derive(Debug, Clone)]
pub struct GateActivationStats {
    /// Input gate activations.
    pub input_gate: StateStatistics,

    /// Forget gate activations.
    pub forget_gate: StateStatistics,

    /// Output gate activations.
    pub output_gate: StateStatistics,

    /// Candidate (cell) gate activations.
    pub cell_gate: StateStatistics,
}

/// Basic distribution summary of a state vector.
#[derive(Debug, Clone)]
pub struct StateStatistics {
    /// Mean value.
    pub mean: f64,

    /// Standard deviation.
    pub std: f64,

    /// Minimum value.
    pub min: f64,

    /// Maximum value.
    pub max: f64,

    /// Percentage of elements with |x| > 0.95 (near saturation).
    pub saturation_percent: f64,
}

/// Indicators of vanishing/exploding gradients across layers.
#[derive(Debug, Clone)]
pub struct GradientFlowStats {
    /// Gradient norm per layer.
    pub layer_gradient_norms: Vec<f64>,

    /// Gradient correlation per layer.
    pub layer_correlations: Vec<f64>,

    /// How strongly gradients vanish through depth.
    pub vanishing_gradient_score: f64,

    /// How strongly gradients explode through depth.
    pub exploding_gradient_score: f64,
}

/// Summary statistics of the attention weights.
#[derive(Debug, Clone)]
pub struct AttentionStats {
    /// Entropy of the attention distribution.
    pub attention_entropy: f64,

    /// Concentration: 1 / (1 + entropy).
    pub attention_concentration: f64,

    /// Diversity across heads (mean absolute weight in the current code).
    pub head_diversity: f64,

    /// Temporal attention patterns (currently a zero placeholder).
    pub temporal_patterns: Vec<f64>,
}
897
impl<
        T: Float
            + Default
            + Clone
            + Send
            + Sync
            + std::iter::Sum
            + for<'a> std::iter::Sum<&'a T>
            + scirs2_core::ndarray::ScalarOperand
            + std::fmt::Debug,
    > LSTMOptimizer<T>
{
    /// Creates the optimizer from `config`, wiring up the LSTM network,
    /// history buffer, meta-learner, learning-rate controller and trackers.
    ///
    /// # Errors
    /// Returns `InvalidConfig` when `validate_config` rejects the settings,
    /// or propagates construction errors from the sub-components.
    pub fn new(config: LearnedOptimizerConfig) -> Result<Self> {
        Self::validate_config(&config)?;

        let lstm_network = LSTMNetwork::new(&config)?;

        let history_buffer = HistoryBuffer::new(config.gradient_history_size);

        let meta_learner = MetaLearner::new(&config)?;

        let lr_controller = AdaptiveLearningRateController::new(&config)?;

        let state_tracker = OptimizationStateTracker::new();

        let metrics = LSTMOptimizerMetrics::new();

        let rng = scirs2_core::random::thread_rng();

        Ok(Self {
            config,
            lstm_network,
            history_buffer,
            meta_learner,
            lr_controller,
            state_tracker,
            metrics,
            step_count: 0,
            rng,
        })
    }

    /// Performs one learned optimization step.
    ///
    /// Flattens `parameters`/`gradients` to 1-D, records them in the history
    /// buffer, runs the LSTM to produce an update direction, scales it by the
    /// adaptive learning rate, subtracts it from the parameters and returns
    /// them reshaped to the caller's original dimensionality.
    pub fn lstm_step<S, D>(
        &mut self,
        parameters: &ArrayBase<S, D>,
        gradients: &ArrayBase<S, D>,
        loss: Option<T>,
    ) -> Result<Array<T, D>>
    where
        S: Data<Elem = T>,
        D: Dimension + Clone,
    {
        let flat_params = self.flatten_to_1d(parameters)?;
        let flat_gradients = self.flatten_to_1d(gradients)?;

        // Record this step before building features, so the newest gradient
        // is available to the history-derived features below.
        self.history_buffer
            .update(&flat_params, &flat_gradients, loss);

        let lstm_input = self.prepare_lstm_input(&flat_gradients)?;

        let lstm_output = self.lstm_network.forward(&lstm_input)?;

        let learning_rate =
            self.lr_controller
                .compute_lr(&flat_gradients, loss, &self.history_buffer)?;

        let updates = self.generate_updates(&lstm_output, &flat_gradients, learning_rate)?;

        // Gradient-descent convention: parameters move against the update.
        let updated_flat = &flat_params - &updates;

        self.state_tracker.update(&flat_gradients, &updates, loss);

        self.update_metrics(&flat_gradients, &updates, learning_rate);

        let updated_params = self.reshape_from_1d(&updated_flat, parameters.raw_dim())?;

        self.step_count += 1;

        Ok(updated_params)
    }

    /// Runs one outer meta-learning step over `tasks` and records the
    /// resulting meta-loss in the metrics.
    pub fn meta_learning_step(&mut self, tasks: &[MetaTask<T>]) -> Result<T> {
        let meta_loss = self.meta_learner.step(tasks, &mut self.lstm_network)?;

        self.metrics.meta_learning_loss = meta_loss.to_f64().unwrap_or(0.0);

        Ok(meta_loss)
    }

    /// Adapts the learned optimizer to a new task domain via the
    /// transfer-learning component.
    pub fn transfer_to_domain(
        &mut self,
        target_tasks: &[MetaTask<T>],
    ) -> Result<TransferResults<T>> {
        self.meta_learner
            .transfer_learner
            .transfer_to_domain(target_tasks, &mut self.lstm_network)
    }

    /// Returns the accumulated runtime metrics.
    pub fn get_metrics(&self) -> &LSTMOptimizerMetrics {
        &self.metrics
    }

    /// Snapshots the state tracker's diagnostics into an owned analysis value.
    pub fn get_state_analysis(&self) -> OptimizationStateAnalysis<T> {
        OptimizationStateAnalysis {
            current_phase: self.state_tracker.phase,
            convergence_indicators: self.state_tracker.convergence_indicators.clone(),
            gradient_analysis: self.state_tracker.gradient_analyzer.clone(),
            landscape_analysis: self.state_tracker.landscape_analyzer.clone(),
            stability_metrics: self.state_tracker.stability_metrics.clone(),
        }
    }

    /// Builds the LSTM input feature vector from the current gradient plus
    /// history-derived features, padded/truncated to
    /// `config.input_features`.
    fn prepare_lstm_input(&self, gradients: &Array1<T>) -> Result<Array1<T>> {
        let mut features = Vec::new();

        // Raw gradient components. NOTE(review): `as_slice` succeeds here
        // because the flattened gradient is contiguous, but the `expect`
        // would panic on a non-standard layout — confirm upstream invariant.
        features.extend_from_slice(gradients.as_slice().expect("unwrap failed"));

        // Differences against up to 5 previous gradients (momentum-like
        // features); skipped entirely until enough history exists.
        if let Some(prev_gradients) = self.history_buffer.get_recent_gradients(5) {
            for prev_grad in prev_gradients {
                let grad_diff: Vec<T> = gradients
                    .iter()
                    .zip(prev_grad.iter())
                    .map(|(&g1, &g2)| g1 - g2)
                    .collect();
                features.extend(grad_diff);
            }
        }

        // Scalar summary statistics of the current gradient.
        let grad_norm = gradients.iter().map(|&g| g * g).sum::<T>().sqrt();
        let grad_mean =
            gradients.iter().cloned().sum::<T>() / T::from(gradients.len()).expect("unwrap failed");
        let grad_std = {
            let variance = gradients
                .iter()
                .map(|&g| (g - grad_mean) * (g - grad_mean))
                .sum::<T>()
                / T::from(gradients.len()).expect("unwrap failed");
            variance.sqrt()
        };

        features.extend([grad_norm, grad_mean, grad_std]);

        // Loss-trend features (change and ratio), when two losses exist.
        if let Some(loss_features) = self.history_buffer.get_loss_features() {
            features.extend(loss_features);
        }

        // Fix the feature width: zero-pad if short, truncate if long.
        features.resize(self.config.input_features, T::zero());

        Ok(Array1::from_vec(features))
    }

    /// Applies the configured output nonlinearity to the LSTM output and
    /// scales the result by the learning rate to obtain the update vector.
    fn generate_updates(
        &self,
        lstm_output: &Array1<T>,
        gradients: &Array1<T>,
        learning_rate: T,
    ) -> Result<Array1<T>> {
        let transformed_output = match self.lstm_network.output_projection.output_transform {
            OutputTransform::Identity => lstm_output.clone(),
            OutputTransform::Tanh => lstm_output.mapv(|x| x.tanh()),
            OutputTransform::ScaledTanh { scale } => {
                let scale_t =
                    scirs2_core::numeric::NumCast::from(scale).unwrap_or_else(|| T::zero());
                lstm_output.mapv(|x| x.tanh() * scale_t)
            }
            OutputTransform::AdaptiveScale => {
                // Shrink updates when gradients are large: scale = 1/(1+||g||).
                let grad_norm = gradients.iter().map(|&g| g * g).sum::<T>().sqrt();
                let adaptive_scale = T::one() / (T::one() + grad_norm);
                lstm_output.mapv(|x| x * adaptive_scale)
            }
            OutputTransform::LearnedNonlinear => {
                // Currently evaluates tanh explicitly via exponentials:
                // (e^x - e^-x) / (e^x + e^-x).
                lstm_output.mapv(|x| {
                    let exp_x = x.exp();
                    (exp_x - (-x).exp()) / (exp_x + (-x).exp())
                })
            }
        };

        let updates = &transformed_output * learning_rate;

        Ok(updates)
    }

    /// Refreshes per-step metrics from the latest gradient/update pair.
    fn update_metrics(&mut self, gradients: &Array1<T>, updates: &Array1<T>, lr: T) {
        let grad_norm = gradients.iter().map(|&g| g * g).sum::<T>().sqrt();
        let update_norm = updates.iter().map(|&u| u * u).sum::<T>().sqrt();

        self.update_lstm_stats();

        // NOTE(review): when grad_norm is zero this division yields NaN/inf,
        // which `to_f64().unwrap_or(1.0)` does NOT catch — consider guarding.
        self.metrics.adaptation_efficiency = (update_norm / grad_norm).to_f64().unwrap_or(1.0);

        self.metrics.computational_overhead = self.estimate_computational_overhead();

        self.metrics.memory_usage_mb = self.estimate_memory_usage();
    }

    /// Recomputes hidden/cell state statistics and attention statistics.
    fn update_lstm_stats(&mut self) {
        // NOTE(review): each iteration overwrites the same fields, so only
        // the LAST layer's statistics are retained — confirm intended.
        for layer in self.lstm_network.layers.iter() {
            let hidden_stats = self.compute_state_stats(&layer.hidden_state);
            let cell_stats = self.compute_state_stats(&layer.cell_state);

            self.metrics.lstm_stats.hidden_state_stats = hidden_stats;
            self.metrics.lstm_stats.cell_state_stats = cell_stats;
        }

        if let Some(ref attention) = self.lstm_network.attention {
            if let Some(ref attentionweights) = attention.attentionweights {
                self.metrics.attention_stats = Some(self.compute_attention_stats(attentionweights));
            }
        }
    }

    /// Summarizes a state vector: mean, std, min, max and the share of
    /// elements with |x| > 0.95 (treated as saturated for tanh-like ranges).
    fn compute_state_stats(&self, state: &Array1<T>) -> StateStatistics {
        let values: Vec<f64> = state.iter().map(|&x| x.to_f64().unwrap_or(0.0)).collect();

        let mean = values.iter().sum::<f64>() / values.len() as f64;
        let variance =
            values.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / values.len() as f64;
        let std = variance.sqrt();
        let min = values.iter().cloned().fold(f64::INFINITY, f64::min);
        let max = values.iter().cloned().fold(f64::NEG_INFINITY, f64::max);

        let saturation_count = values.iter().filter(|&&x| x.abs() > 0.95).count();
        let saturation_percent = saturation_count as f64 / values.len() as f64 * 100.0;

        StateStatistics {
            mean,
            std,
            min,
            max,
            saturation_percent,
        }
    }

    /// Computes heuristic attention statistics: Shannon entropy over the raw
    /// weights, a derived concentration score, and mean |weight| as a
    /// diversity proxy. Temporal patterns are a fixed zero placeholder.
    fn compute_attention_stats(&self, attentionweights: &Array2<T>) -> AttentionStats {
        let weights: Vec<f64> = attentionweights
            .iter()
            .map(|&w| w.to_f64().unwrap_or(0.0))
            .collect();

        // Entropy of positive weights only (zero/negative entries skipped).
        let entropy = weights
            .iter()
            .filter(|&&w| w > 0.0)
            .map(|&w| -w * w.ln())
            .sum::<f64>();

        let concentration = 1.0 / (1.0 + entropy);

        let head_diversity = weights.iter().map(|&w| w.abs()).sum::<f64>() / weights.len() as f64;

        AttentionStats {
            attention_entropy: entropy,
            attention_concentration: concentration,
            head_diversity,
            temporal_patterns: vec![0.0; 10],
        }
    }

    /// Rough cost multiplier relative to a plain optimizer step, built from
    /// fixed per-component constants.
    fn estimate_computational_overhead(&self) -> f64 {
        let lstm_overhead = self.config.num_layers as f64 * 0.1;
        let attention_overhead = if self.config.use_attention { 0.2 } else { 0.0 };
        let meta_learning_overhead = 0.1;

        1.0 + lstm_overhead + attention_overhead + meta_learning_overhead
    }

    /// Rough memory estimate in MiB, assuming 8 bytes per element for the
    /// network parameters, gradient history and recurrent states.
    fn estimate_memory_usage(&self) -> f64 {
        let parameter_memory =
            self.config.hidden_size as f64 * self.config.num_layers as f64 * 8.0 / 1024.0 / 1024.0;
        let history_memory =
            self.config.gradient_history_size as f64 * self.config.input_features as f64 * 8.0
                / 1024.0
                / 1024.0;
        let lstm_state_memory =
            self.config.hidden_size as f64 * self.config.num_layers as f64 * 2.0 * 8.0
                / 1024.0
                / 1024.0;

        parameter_memory + history_memory + lstm_state_memory
    }

    /// Rejects configurations with zero sizes or a non-positive
    /// meta-learning rate.
    ///
    /// # Errors
    /// Returns `InvalidConfig` describing the first violated constraint.
    fn validate_config(config: &LearnedOptimizerConfig) -> Result<()> {
        if config.hidden_size == 0 {
            return Err(OptimError::InvalidConfig(
                "Hidden size must be positive".to_string(),
            ));
        }

        if config.num_layers == 0 {
            return Err(OptimError::InvalidConfig(
                "Number of layers must be positive".to_string(),
            ));
        }

        if config.input_features == 0 {
            return Err(OptimError::InvalidConfig(
                "Input features must be positive".to_string(),
            ));
        }

        if config.meta_learning_rate <= 0.0 {
            return Err(OptimError::InvalidConfig(
                "Meta learning rate must be positive".to_string(),
            ));
        }

        Ok(())
    }

    /// Copies an arbitrary-dimensional array into a fresh 1-D array
    /// (row-major iteration order).
    fn flatten_to_1d<S, D>(&self, array: &ArrayBase<S, D>) -> Result<Array1<T>>
    where
        S: Data<Elem = T>,
        D: Dimension,
    {
        Ok(Array1::from_iter(array.iter().cloned()))
    }

    /// Reshapes a flat vector back into `shape`.
    ///
    /// # Errors
    /// Returns `InvalidConfig` when the element count does not match `shape`.
    fn reshape_from_1d<D>(&self, flat: &Array1<T>, shape: D) -> Result<Array<T, D>>
    where
        D: Dimension + Clone,
    {
        Array::from_shape_vec(shape, flat.to_vec())
            .map_err(|e| OptimError::InvalidConfig(format!("Reshape error: {}", e)))
    }
}
1281
1282impl<T: Float + Debug + Default + Clone + 'static + Send + Sync> LSTMNetwork<T> {
1285 fn new(config: &LearnedOptimizerConfig) -> Result<Self> {
1287 let mut layers = Vec::new();
1288
1289 for i in 0..config.num_layers {
1291 let input_size = if i == 0 {
1292 config.input_features
1293 } else {
1294 config.hidden_size
1295 };
1296 let layer = LSTMLayer::new(input_size, config.hidden_size)?;
1297 layers.push(layer);
1298 }
1299
1300 let output_projection = OutputProjection::new(
1302 config.hidden_size,
1303 config.output_features,
1304 OutputTransform::ScaledTanh { scale: 0.1 },
1305 )?;
1306
1307 let attention = if config.use_attention {
1309 Some(AttentionMechanism::new(config)?)
1310 } else {
1311 None
1312 };
1313
1314 let layer_norms = (0..config.num_layers)
1316 .map(|_| LayerNormalization::new(config.hidden_size))
1317 .collect::<Result<Vec<_>>>()?;
1318
1319 Ok(Self {
1320 layers,
1321 output_projection,
1322 attention,
1323 layer_norms,
1324 dropout_rate: config.dropout_rate,
1325 })
1326 }
1327
1328 fn forward(&mut self, input: &Array1<T>) -> Result<Array1<T>> {
1330 let mut current_input = input.clone();
1331
1332 for i in 0..self.layers.len() {
1334 current_input = self.layers[i].forward(¤t_input)?;
1335
1336 current_input = self.layer_norms[i].forward(¤t_input)?;
1338
1339 if self.dropout_rate > 0.0 {
1341 current_input = self.apply_dropout(¤t_input)?;
1342 }
1343 }
1344
1345 if let Some(ref mut attention) = self.attention {
1347 current_input = attention.forward(¤t_input)?;
1348 }
1349
1350 let output = self.output_projection.forward(¤t_input)?;
1352
1353 Ok(output)
1354 }
1355
1356 fn apply_dropout(&self, input: &Array1<T>) -> Result<Array1<T>> {
1358 Ok(input.mapv(|x| {
1360 if T::from(scirs2_core::random::thread_rng().gen_range(0.0..1.0))
1361 .expect("unwrap failed")
1362 < scirs2_core::numeric::NumCast::from(self.dropout_rate)
1363 .unwrap_or_else(|| T::zero())
1364 {
1365 T::zero()
1366 } else {
1367 x / scirs2_core::numeric::NumCast::from(1.0 - self.dropout_rate)
1368 .unwrap_or_else(|| T::zero())
1369 }
1370 }))
1371 }
1372}
1373
1374impl<T: Float + Debug + Default + Clone + 'static + Send + Sync> LSTMLayer<T> {
1375 fn new(_input_size: usize, hiddensize: usize) -> Result<Self> {
1377 let scale = (2.0 / (_input_size + hiddensize) as f64).sqrt();
1379
1380 Ok(Self {
1381 weight_ih: Self::xavier_init(4 * hiddensize, _input_size, scale),
1382 weight_hh: Self::xavier_init(4 * hiddensize, hiddensize, scale),
1383 bias_ih: Array1::zeros(4 * hiddensize),
1384 bias_hh: Array1::zeros(4 * hiddensize),
1385 hidden_state: Array1::zeros(hiddensize),
1386 cell_state: Array1::zeros(hiddensize),
1387 hiddensize,
1388 })
1389 }
1390
1391 fn forward(&mut self, input: &Array1<T>) -> Result<Array1<T>> {
1393 let ih_linear = self.weight_ih.dot(input) + &self.bias_ih;
1395 let hh_linear = self.weight_hh.dot(&self.hidden_state) + &self.bias_hh;
1396 let gates = ih_linear + hh_linear;
1397
1398 let input_gate = Self::sigmoid(&gates.slice(s![0..self.hiddensize]).to_owned());
1400 let forget_gate = Self::sigmoid(
1401 &gates
1402 .slice(s![self.hiddensize..2 * self.hiddensize])
1403 .to_owned(),
1404 );
1405 let cell_gate = Self::tanh(
1406 &gates
1407 .slice(s![2 * self.hiddensize..3 * self.hiddensize])
1408 .to_owned(),
1409 );
1410 let output_gate = Self::sigmoid(
1411 &gates
1412 .slice(s![3 * self.hiddensize..4 * self.hiddensize])
1413 .to_owned(),
1414 );
1415
1416 self.cell_state = &forget_gate * &self.cell_state + &input_gate * &cell_gate;
1418
1419 self.hidden_state = &output_gate * &Self::tanh(&self.cell_state);
1421
1422 Ok(self.hidden_state.clone())
1423 }
1424
1425 fn xavier_init(rows: usize, cols: usize, scale: f64) -> Array2<T> {
1427 Array2::from_shape_fn((rows, cols), |_| {
1428 let val = (scirs2_core::random::thread_rng().gen_range(0.0..1.0) - 0.5) * 2.0 * scale;
1429 scirs2_core::numeric::NumCast::from(val).unwrap_or_else(|| T::zero())
1430 })
1431 }
1432
1433 fn sigmoid(x: &Array1<T>) -> Array1<T> {
1435 x.mapv(|xi| T::one() / (T::one() + (-xi).exp()))
1436 }
1437
1438 fn tanh(x: &Array1<T>) -> Array1<T> {
1440 x.mapv(|xi| xi.tanh())
1441 }
1442}
1443
1444impl<T: Float + Debug + Send + Sync + 'static + Default + Clone> HistoryBuffer<T> {
1445 fn new(_maxlength: usize) -> Self {
1447 Self {
1448 gradients: VecDeque::with_capacity(_maxlength),
1449 parameters: VecDeque::with_capacity(_maxlength),
1450 losses: VecDeque::with_capacity(_maxlength),
1451 learning_rates: VecDeque::with_capacity(_maxlength),
1452 update_magnitudes: VecDeque::with_capacity(_maxlength),
1453 _maxlength,
1454 feature_cache: None,
1455 }
1456 }
1457
1458 fn update(&mut self, params: &Array1<T>, grads: &Array1<T>, loss: Option<T>) {
1460 self.parameters.push_back(params.clone());
1462 self.gradients.push_back(grads.clone());
1463
1464 if let Some(l) = loss {
1465 self.losses.push_back(l);
1466 }
1467
1468 while self.parameters.len() > self._maxlength {
1470 self.parameters.pop_front();
1471 }
1472 while self.gradients.len() > self._maxlength {
1473 self.gradients.pop_front();
1474 }
1475 while self.losses.len() > self._maxlength {
1476 self.losses.pop_front();
1477 }
1478
1479 self.feature_cache = None;
1481 }
1482
1483 fn get_recent_gradients(&self, count: usize) -> Option<Vec<&Array1<T>>> {
1485 if self.gradients.len() < count {
1486 return None;
1487 }
1488
1489 Some(self.gradients.iter().rev().take(count).collect())
1490 }
1491
1492 fn get_loss_features(&self) -> Option<Vec<T>> {
1494 if self.losses.len() < 2 {
1495 return None;
1496 }
1497
1498 let current_loss = *self.losses.back().expect("unwrap failed");
1499 let prev_loss = self.losses[self.losses.len() - 2];
1500
1501 let loss_change = current_loss - prev_loss;
1502 let loss_ratio = if prev_loss.abs()
1503 > scirs2_core::numeric::NumCast::from(1e-8).unwrap_or_else(|| T::zero())
1504 {
1505 current_loss / prev_loss
1506 } else {
1507 T::one()
1508 };
1509
1510 Some(vec![loss_change, loss_ratio])
1511 }
1512}
1513
/// Owned snapshot of the state tracker's diagnostics, returned by
/// `LSTMOptimizer::get_state_analysis`.
#[derive(Debug, Clone)]
pub struct OptimizationStateAnalysis<T: Float + Debug + Send + Sync + 'static> {
    /// Qualitative phase at snapshot time.
    pub current_phase: OptimizationPhase,
    /// Convergence signals at snapshot time.
    pub convergence_indicators: ConvergenceIndicators<T>,
    /// Gradient analysis at snapshot time.
    pub gradient_analysis: GradientAnalyzer<T>,
    /// Landscape estimates at snapshot time.
    pub landscape_analysis: LossLandscapeAnalyzer<T>,
    /// Stability estimates at snapshot time.
    pub stability_metrics: StabilityMetrics<T>,
}

/// Outcome of a domain-transfer attempt (`transfer_to_domain`).
#[derive(Debug, Clone)]
pub struct TransferResults<T: Float + Debug + Send + Sync + 'static> {
    /// Performance before adaptation.
    pub initial_performance: T,
    /// Performance after adaptation.
    pub final_performance: T,
    /// Adaptation steps performed.
    pub adaptation_steps: usize,
    /// Efficiency score of the transfer.
    pub transfer_efficiency: T,
}
1533
/// `Default` delegates to `new` so both construction paths stay in sync.
impl Default for LSTMOptimizerMetrics {
    fn default() -> Self {
        Self::new()
    }
}
1541
1542impl LSTMOptimizerMetrics {
1543 fn new() -> Self {
1544 Self {
1545 meta_learning_loss: 0.0,
1546 avg_convergence_speed: 0.0,
1547 generalization_performance: 0.0,
1548 adaptation_efficiency: 0.0,
1549 transfer_success_rate: 0.0,
1550 memory_usage_mb: 0.0,
1551 computational_overhead: 1.0,
1552 lstm_stats: LSTMNetworkStats {
1553 gate_activations: GateActivationStats {
1554 input_gate: StateStatistics::default(),
1555 forget_gate: StateStatistics::default(),
1556 output_gate: StateStatistics::default(),
1557 cell_gate: StateStatistics::default(),
1558 },
1559 hidden_state_stats: StateStatistics::default(),
1560 cell_state_stats: StateStatistics::default(),
1561 gradient_flow_stats: GradientFlowStats {
1562 layer_gradient_norms: Vec::new(),
1563 layer_correlations: Vec::new(),
1564 vanishing_gradient_score: 0.0,
1565 exploding_gradient_score: 0.0,
1566 },
1567 },
1568 attention_stats: None,
1569 }
1570 }
1571}
1572
/// All-zero defaults. NOTE(review): `min`/`max` default to 0.0 rather than
/// ±infinity, so a default value looks like a degenerate observation rather
/// than "no data" — confirm consumers treat it that way.
impl Default for StateStatistics {
    fn default() -> Self {
        Self {
            mean: 0.0,
            std: 0.0,
            min: 0.0,
            max: 0.0,
            saturation_percent: 0.0,
        }
    }
}
1584
1585impl<T: Float + Debug + Send + Sync + 'static + Default + Clone> MetaLearner<T> {
1589 fn new(config: &LearnedOptimizerConfig) -> Result<Self> {
1590 Ok(Self {
1592 strategy: MetaOptimizationStrategy::MAML,
1593 meta_parameters: HashMap::new(),
1594 meta_gradients: HashMap::new(),
1595 task_history: VecDeque::new(),
1596 meta_state: MetaLearningState {
1597 meta_step: 0,
1598 meta_lr: scirs2_core::numeric::NumCast::from(0.001).unwrap_or_else(|| T::zero()),
1599 adaptation_rate: scirs2_core::numeric::NumCast::from(0.1)
1600 .unwrap_or_else(|| T::zero()),
1601 meta_validation_performance: T::zero(),
1602 adaptation_history: VecDeque::new(),
1603 inner_loop_state: InnerLoopState {
1604 inner_step: 0,
1605 inner_parameters: Array1::zeros(1),
1606 inner_optimizer_state: HashMap::new(),
1607 inner_performance: T::zero(),
1608 },
1609 },
1610 transfer_learner: TransferLearner {
1611 source_knowledge: HashMap::new(),
1612 adaptation_parameters: Array1::zeros(1),
1613 transfer_metrics: TransferMetrics {
1614 efficiency: T::zero(),
1615 adaptation_speed: T::zero(),
1616 knowledge_retention: T::zero(),
1617 negative_transfer_score: T::zero(),
1618 },
1619 similarity_estimator: DomainSimilarityEstimator {
1620 domain_embeddings: HashMap::new(),
1621 similarity_params: Array1::zeros(1),
1622 similarity_function: SimilarityFunction::Cosine,
1623 },
1624 },
1625 })
1626 }
1627
1628 fn step(&mut self, tasks: &[MetaTask<T>], network: &mut LSTMNetwork<T>) -> Result<T> {
1629 Ok(T::zero())
1631 }
1632}
1633
1634impl<T: Float + Debug + Send + Sync + 'static + Default + Clone> TransferLearner<T> {
1635 fn transfer_to_domain(
1636 &mut self,
1637 _target_tasks: &[MetaTask<T>],
1638 _network: &mut LSTMNetwork<T>,
1639 ) -> Result<TransferResults<T>> {
1640 Ok(TransferResults {
1642 initial_performance: T::zero(),
1643 final_performance: T::zero(),
1644 adaptation_steps: 0,
1645 transfer_efficiency: T::zero(),
1646 })
1647 }
1648}
1649
1650impl<T: Float + Debug + Send + Sync + 'static + Default + Clone> AdaptiveLearningRateController<T> {
1651 fn new(config: &LearnedOptimizerConfig) -> Result<Self> {
1652 Ok(Self {
1654 base_lr: scirs2_core::numeric::NumCast::from(0.001).unwrap_or_else(|| T::zero()),
1655 current_lr: scirs2_core::numeric::NumCast::from(0.001).unwrap_or_else(|| T::zero()),
1656 adaptation_params: LRAdaptationParams {
1657 momentum: scirs2_core::numeric::NumCast::from(0.9).unwrap_or_else(|| T::zero()),
1658 gradient_sensitivity: scirs2_core::numeric::NumCast::from(0.1)
1659 .unwrap_or_else(|| T::zero()),
1660 loss_sensitivity: scirs2_core::numeric::NumCast::from(0.1)
1661 .unwrap_or_else(|| T::zero()),
1662 min_lr: scirs2_core::numeric::NumCast::from(1e-6).unwrap_or_else(|| T::zero()),
1663 max_lr: scirs2_core::numeric::NumCast::from(0.1).unwrap_or_else(|| T::zero()),
1664 adaptation_rate: scirs2_core::numeric::NumCast::from(0.01)
1665 .unwrap_or_else(|| T::zero()),
1666 },
1667 lr_history: VecDeque::new(),
1668 performance_tracker: PerformanceTracker {
1669 recent_losses: VecDeque::new(),
1670 trend: PerformanceTrend::Unknown,
1671 stagnation_counter: 0,
1672 best_performance: T::zero(),
1673 improvement_rate: T::zero(),
1674 },
1675 schedule_params: None,
1676 })
1677 }
1678
1679 fn compute_lr(
1680 &mut self,
1681 gradients: &Array1<T>,
1682 _loss: Option<T>,
1683 _history: &HistoryBuffer<T>,
1684 ) -> Result<T> {
1685 Ok(self.current_lr)
1687 }
1688}
1689
1690impl<T: Float + Debug + Send + Sync + 'static + Default + Clone> OptimizationStateTracker<T> {
1691 fn new() -> Self {
1692 Self {
1693 phase: OptimizationPhase::InitialDescent,
1694 convergence_indicators: ConvergenceIndicators {
1695 gradient_norm_trend: Vec::new(),
1696 loss_change_trend: Vec::new(),
1697 parameter_change_magnitude: T::zero(),
1698 convergence_probability: T::zero(),
1699 estimated_steps_to_convergence: None,
1700 },
1701 gradient_analyzer: GradientAnalyzer {
1702 gradient_stats: GradientStatistics {
1703 mean_norm: T::zero(),
1704 norm_variance: T::zero(),
1705 direction_consistency: T::zero(),
1706 magnitude_distribution: Vec::new(),
1707 component_stats: Array1::zeros(1),
1708 },
1709 correlation_tracker: GradientCorrelationTracker {
1710 correlation_matrix: Array2::zeros((1, 1)),
1711 temporal_correlations: VecDeque::new(),
1712 cross_correlations: HashMap::new(),
1713 },
1714 noise_estimator: GradientNoiseEstimator {
1715 noise_level: T::zero(),
1716 signal_to_noise_ratio: T::zero(),
1717 noise_characteristics: NoiseCharacteristics {
1718 noise_type: NoiseType::White,
1719 scale: T::zero(),
1720 temporal_correlation: T::zero(),
1721 spatial_correlation: T::zero(),
1722 },
1723 },
1724 flow_analyzer: GradientFlowAnalyzer {
1725 flow_field: Array2::zeros((1, 1)),
1726 critical_points: Vec::new(),
1727 stability: FlowStability::Unknown,
1728 attractors: Vec::new(),
1729 repellers: Vec::new(),
1730 },
1731 },
1732 landscape_analyzer: LossLandscapeAnalyzer {
1733 local_curvature: T::zero(),
1734 hessian_eigenvalues: None,
1735 roughness: T::zero(),
1736 basin_size: T::zero(),
1737 barrier_heights: Vec::new(),
1738 },
1739 stability_metrics: StabilityMetrics {
1740 lyapunov_exponents: Array1::zeros(1),
1741 stability_margin: T::zero(),
1742 perturbation_sensitivity: T::zero(),
1743 robustness_score: T::zero(),
1744 },
1745 }
1746 }
1747
1748 fn update(&mut self, gradients: &Array1<T>, _updates: &Array1<T>, loss: Option<T>) {
1749 }
1751}
1752
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_lstm_optimizer_creation() {
        let config = LearnedOptimizerConfig::default();
        let optimizer = LSTMOptimizer::<f64>::new(config);
        assert!(optimizer.is_ok());
    }

    #[test]
    fn test_lstm_layer_creation() {
        let layer = LSTMLayer::<f64>::new(10, 20);
        assert!(layer.is_ok());

        let layer = layer.expect("unwrap failed");
        assert_eq!(layer.hiddensize, 20);
        // Gate weights are stacked for the four LSTM gates: 4 * hidden = 80 rows.
        assert_eq!(layer.weight_ih.shape(), &[80, 10]);
        assert_eq!(layer.weight_hh.shape(), &[80, 20]);
    }

    #[test]
    fn test_history_buffer() {
        let mut buffer = HistoryBuffer::<f64>::new(5);

        let params = Array1::from_vec(vec![1.0, 2.0, 3.0]);
        let grads = Array1::from_vec(vec![0.1, 0.2, 0.3]);

        // Fixed mojibake: the original read `buffer.update(¶ms, ...)` —
        // `&params` had been corrupted into `¶ms` (`&para` -> `¶`).
        buffer.update(&params, &grads, Some(0.5));

        assert_eq!(buffer.gradients.len(), 1);
        assert_eq!(buffer.parameters.len(), 1);
        assert_eq!(buffer.losses.len(), 1);
    }

    #[test]
    fn test_config_validation() {
        let mut config = LearnedOptimizerConfig::default();
        assert!(LSTMOptimizer::<f64>::validate_config(&config).is_ok());

        // A zero hidden size must be rejected.
        config.hidden_size = 0;
        assert!(LSTMOptimizer::<f64>::validate_config(&config).is_err());
    }

    #[test]
    fn test_lstm_network_creation() {
        let config = LearnedOptimizerConfig::default();
        let network = LSTMNetwork::<f64>::new(&config);
        assert!(network.is_ok());

        let network = network.expect("unwrap failed");
        assert_eq!(network.layers.len(), config.num_layers);
        assert!(network.attention.is_some());
    }

    #[test]
    fn test_metrics_initialization() {
        let metrics = LSTMOptimizerMetrics::new();
        assert_eq!(metrics.meta_learning_loss, 0.0);
        assert_eq!(metrics.computational_overhead, 1.0);
        assert!(metrics.attention_stats.is_none());
    }
}