use super::{
    utils, Experience, ImprovementReward, OptimizationAction, OptimizationState,
    RLOptimizationConfig, RLOptimizer, RewardFunction,
};
use crate::error::{OptimizeError, OptimizeResult};
use crate::result::OptimizeResults;
use scirs2_core::ndarray::{Array1, Array2, Array3, ArrayView1};
use scirs2_core::random::{rng, Rng};
use std::collections::{HashMap, VecDeque};

/// Policy network augmented with meta-learning components.
#[derive(Debug, Clone)]
pub struct MetaPolicyNetwork {
    /// Policy weights, one (output x input) slice per layer.
    pub policy_weights: Array3<f64>,
    /// Meta-network weights used to modulate the policy.
    pub meta_weights: Array3<f64>,
    pub policy_bias: Array2<f64>,
    pub meta_bias: Array2<f64>,
    /// Layer sizes, including the input and output layers.
    pub layer_sizes: Vec<usize>,
    /// Per-unit learning-rate multipliers adapted by meta-gradients.
    pub adaptive_learning_rates: Array2<f64>,
    pub meta_gradient_accumulator: Array3<f64>,
    pub second_order_info: Array3<f64>,
    /// Current curriculum difficulty in [0, 1].
    pub curriculum_difficulty: f64,
    /// Learned embedding per problem class.
    pub problem_embeddings: HashMap<String, Array1<f64>>,
}

impl MetaPolicyNetwork {
    pub fn new(input_size: usize, output_size: usize, hidden_sizes: Vec<usize>) -> Self {
        let mut layer_sizes = vec![input_size];
        layer_sizes.extend(hidden_sizes);
        layer_sizes.push(output_size);

        let num_layers = layer_sizes.len() - 1;
        let max_layer_size = *layer_sizes.iter().max().expect("layer_sizes is non-empty");

        let mut policy_weights = Array3::zeros((num_layers, max_layer_size, max_layer_size));
        let mut meta_weights = Array3::zeros((num_layers, max_layer_size, max_layer_size));

        // Xavier/Glorot-style initialization scaled by fan-in and fan-out.
        for layer in 0..num_layers {
            let fan_in = layer_sizes[layer];
            let fan_out = layer_sizes[layer + 1];
            let xavier_std = (2.0 / (fan_in + fan_out) as f64).sqrt();

            for i in 0..fan_out {
                for j in 0..fan_in {
                    policy_weights[[layer, i, j]] =
                        rng().random_range(-0.5..0.5) * 2.0 * xavier_std;
                    meta_weights[[layer, i, j]] =
                        rng().random_range(-0.5..0.5) * 2.0 * xavier_std * 0.1;
                }
            }
        }

        Self {
            policy_weights,
            meta_weights,
            policy_bias: Array2::zeros((num_layers, max_layer_size)),
            meta_bias: Array2::zeros((num_layers, max_layer_size)),
            layer_sizes,
            adaptive_learning_rates: Array2::from_elem((num_layers, max_layer_size), 0.01),
            meta_gradient_accumulator: Array3::zeros((num_layers, max_layer_size, max_layer_size)),
            second_order_info: Array3::zeros((num_layers, max_layer_size, max_layer_size)),
            curriculum_difficulty: 0.1,
            problem_embeddings: HashMap::new(),
        }
    }
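
    // Note on the initialization above (informational comment, not an API
    // guarantee): the scale follows the Xavier/Glorot heuristic,
    // std = sqrt(2 / (fan_in + fan_out)), and samples are drawn uniformly from
    // roughly [-std, std). For example, a 4 -> 8 layer gives
    // std = sqrt(2 / 12) ~= 0.41. The extra 0.1 factor keeps the meta-weights
    // deliberately small so the meta-pathway starts out weak.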

    /// Forward pass that blends a problem-class embedding into the input and
    /// returns `(policy_output, meta_output)`.
    pub fn meta_forward(
        &mut self,
        state_features: &ArrayView1<f64>,
        problem_class: &str,
        meta_context: &Array1<f64>,
    ) -> (Array1<f64>, Array1<f64>) {
        let problem_embedding =
            self.get_or_create_problem_embedding(problem_class, state_features.len());

        let mut augmented_input = state_features.to_owned();

        // Blend the problem embedding into the input features.
        for (i, &emb) in problem_embedding.iter().enumerate() {
            if i < augmented_input.len() {
                augmented_input[i] += emb * 0.1;
            }
        }

        let policy_output = self.forward_policy(&augmented_input.view());

        let meta_output = self.forward_meta(&augmented_input.view(), meta_context);

        (policy_output, meta_output)
    }

    fn forward_policy(&self, input: &ArrayView1<f64>) -> Array1<f64> {
        let mut current_input = input.to_owned();

        for layer in 0..(self.layer_sizes.len() - 1) {
            let layer_input_size = self.layer_sizes[layer];
            let layer_output_size = self.layer_sizes[layer + 1];

            let mut layer_output = Array1::<f64>::zeros(layer_output_size);

            for i in 0..layer_output_size {
                for j in 0..layer_input_size.min(current_input.len()) {
                    layer_output[i] += self.policy_weights[[layer, i, j]] * current_input[j];
                }
                layer_output[i] += self.policy_bias[[layer, i]];

                // ELU activation: x for x > 0, exp(x) - 1 otherwise.
                layer_output[i] = if layer_output[i] > 0.0 {
                    layer_output[i]
                } else {
                    layer_output[i].exp() - 1.0
                };
            }

            current_input = layer_output;
        }

        current_input
    }

    fn forward_meta(&self, input: &ArrayView1<f64>, meta_context: &Array1<f64>) -> Array1<f64> {
        let mut meta_input = input.to_owned();
        for (i, &ctx) in meta_context.iter().enumerate() {
            if i < meta_input.len() {
                meta_input[i] += ctx * 0.05;
            }
        }

        let mut current_input = meta_input;

        for layer in 0..(self.layer_sizes.len() - 1) {
            let layer_input_size = self.layer_sizes[layer];
            let layer_output_size = self.layer_sizes[layer + 1];

            let mut layer_output = Array1::<f64>::zeros(layer_output_size);

            for i in 0..layer_output_size {
                for j in 0..layer_input_size.min(current_input.len()) {
                    layer_output[i] += self.meta_weights[[layer, i, j]] * current_input[j];
                }
                layer_output[i] += self.meta_bias[[layer, i]];

                // Sigmoid activation keeps meta-outputs in (0, 1).
                layer_output[i] = 1.0 / (1.0 + (-layer_output[i]).exp());
            }

            current_input = layer_output;
        }

        current_input
    }

    fn get_or_create_problem_embedding(
        &mut self,
        problem_class: &str,
        input_size: usize,
    ) -> Array1<f64> {
        if let Some(embedding) = self.problem_embeddings.get(problem_class) {
            embedding.clone()
        } else {
            let embedding =
                Array1::from_shape_fn(input_size, |_| rng().random_range(-0.05..0.05));
            self.problem_embeddings
                .insert(problem_class.to_string(), embedding.clone());
            embedding
        }
    }

    /// Apply meta-gradients: adapt per-unit learning rates, then update the
    /// policy and meta weights with the adapted rates.
    pub fn meta_update(
        &mut self,
        meta_gradients: &MetaGradients,
        base_learning_rate: f64,
        meta_learning_rate: f64,
    ) {
        for layer in 0..(self.layer_sizes.len() - 1) {
            for i in 0..self.layer_sizes[layer + 1] {
                for j in 0..self.layer_sizes[layer] {
                    let meta_grad = meta_gradients.meta_lr_gradients[[layer, i, j]];
                    // Multiplicative learning-rate adaptation, clamped to a safe range.
                    self.adaptive_learning_rates[[layer, i]] *=
                        (1.0 + meta_learning_rate * meta_grad).clamp(0.1, 10.0);

                    let adaptive_lr = self.adaptive_learning_rates[[layer, i]] * base_learning_rate;
                    self.policy_weights[[layer, i, j]] +=
                        adaptive_lr * meta_gradients.policy_gradients[[layer, i, j]];

                    self.meta_weights[[layer, i, j]] +=
                        meta_learning_rate * meta_gradients.meta_weight_gradients[[layer, i, j]];
                }

                let adaptive_lr = self.adaptive_learning_rates[[layer, i]] * base_learning_rate;
                self.policy_bias[[layer, i]] +=
                    adaptive_lr * meta_gradients.policy_bias_gradients[[layer, i]];
                self.meta_bias[[layer, i]] +=
                    meta_learning_rate * meta_gradients.meta_bias_gradients[[layer, i]];
            }
        }

        self.update_curriculum_difficulty(meta_gradients);
    }

    fn update_curriculum_difficulty(&mut self, meta_gradients: &MetaGradients) {
        let gradient_norm = meta_gradients
            .policy_gradients
            .iter()
            .map(|&g| g * g)
            .sum::<f64>()
            .sqrt();

        // Small gradients suggest the current difficulty is mastered; large
        // gradients suggest the problems are still too hard.
        if gradient_norm < 0.1 {
            self.curriculum_difficulty = (self.curriculum_difficulty * 1.05).min(1.0);
        } else if gradient_norm > 1.0 {
            self.curriculum_difficulty = (self.curriculum_difficulty * 0.95).max(0.01);
        }
    }
}

/// Gradient bundle produced by meta-training.
#[derive(Debug, Clone)]
pub struct MetaGradients {
    pub policy_gradients: Array3<f64>,
    pub meta_weight_gradients: Array3<f64>,
    pub meta_lr_gradients: Array3<f64>,
    pub policy_bias_gradients: Array2<f64>,
    pub meta_bias_gradients: Array2<f64>,
    pub second_order_terms: Array3<f64>,
}

/// Policy-gradient optimizer with a meta-learning layer, curriculum control,
/// and a trajectory-level experience buffer.
#[derive(Debug, Clone)]
pub struct AdvancedAdvancedPolicyGradientOptimizer {
    config: RLOptimizationConfig,
    meta_policy: MetaPolicyNetwork,
    reward_function: ImprovementReward,
    meta_trajectories: VecDeque<MetaTrajectory>,
    problem_class_history: VecDeque<String>,
    best_params: Array1<f64>,
    best_objective: f64,
    meta_stats: MetaLearningStats,
    curriculum_controller: CurriculumController,
    meta_experience_buffer: MetaExperienceBuffer,
}

#[derive(Debug, Clone)]
pub struct MetaTrajectory {
    pub experiences: Vec<Experience>,
    pub problem_class: String,
    pub initial_meta_context: Array1<f64>,
    pub learning_metrics: LearningMetrics,
    pub adaptation_speed: f64,
}

#[derive(Debug, Clone)]
pub struct LearningMetrics {
    pub improvement_rate: f64,
    pub convergence_speed: f64,
    pub exploration_efficiency: f64,
    pub generalization_score: f64,
}

#[derive(Debug, Clone)]
pub struct MetaLearningStats {
    pub avg_learning_rates: Array1<f64>,
    pub meta_gradient_norms: VecDeque<f64>,
    pub problem_class_performance: HashMap<String, f64>,
    pub curriculum_progress: f64,
    pub adaptation_efficiency: f64,
}

#[derive(Debug, Clone)]
pub struct CurriculumController {
    pub difficulty_level: f64,
    pub advancement_thresholds: Vec<f64>,
    pub difficulty_generators: HashMap<String, f64>,
    pub progress_tracker: VecDeque<f64>,
}

impl Default for CurriculumController {
    fn default() -> Self {
        Self::new()
    }
}

impl CurriculumController {
    pub fn new() -> Self {
        Self {
            difficulty_level: 0.1,
            advancement_thresholds: vec![0.8, 0.85, 0.9, 0.95],
            difficulty_generators: HashMap::new(),
            progress_tracker: VecDeque::with_capacity(100),
        }
    }

    pub fn should_advance(&self) -> bool {
        if self.progress_tracker.len() < 20 {
            return false;
        }

        let recent_performance: f64 =
            self.progress_tracker.iter().rev().take(20).sum::<f64>() / 20.0;

        // Pick the threshold that corresponds to the current difficulty band.
        let threshold_idx = ((self.difficulty_level * 4.0) as usize).min(3);
        recent_performance > self.advancement_thresholds[threshold_idx]
    }

    pub fn advance_difficulty(&mut self) {
        self.difficulty_level = (self.difficulty_level * 1.2).min(1.0);
    }

    pub fn update_progress(&mut self, performance: f64) {
        self.progress_tracker.push_back(performance);
        if self.progress_tracker.len() > 100 {
            self.progress_tracker.pop_front();
        }

        if self.should_advance() {
            self.advance_difficulty();
        }
    }
}
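
// Worked example of the advancement rule above (illustrative only): the mean of
// the last 20 progress values is compared against a threshold chosen from the
// current difficulty. Difficulty 0.1 maps to index min(floor(0.1 * 4), 3) = 0,
// so advancement requires a mean above 0.8; difficulty 0.9 maps to index 3 and
// requires a mean above 0.95.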

#[derive(Debug, Clone)]
pub struct MetaExperienceBuffer {
    pub trajectories: VecDeque<MetaTrajectory>,
    pub max_size: usize,
    pub class_weights: HashMap<String, f64>,
}

impl MetaExperienceBuffer {
    pub fn new(max_size: usize) -> Self {
        Self {
            trajectories: VecDeque::with_capacity(max_size),
            max_size,
            class_weights: HashMap::new(),
        }
    }

    pub fn add_trajectory(&mut self, trajectory: MetaTrajectory) {
        let avg_reward = trajectory.experiences.iter().map(|e| e.reward).sum::<f64>()
            / trajectory.experiences.len().max(1) as f64;

        // Track a per-class weight (bookkeeping only; sampling below is uniform).
        *self
            .class_weights
            .entry(trajectory.problem_class.clone())
            .or_insert(1.0) *= if avg_reward > 0.0 { 1.05 } else { 0.95 };

        self.trajectories.push_back(trajectory);
        if self.trajectories.len() > self.max_size {
            self.trajectories.pop_front();
        }
    }

    /// Sample trajectories uniformly at random (with replacement).
    pub fn sample_meta_batch(&self, batch_size: usize) -> Vec<MetaTrajectory> {
        let mut batch = Vec::new();

        for _ in 0..batch_size.min(self.trajectories.len()) {
            let idx = rng().random_range(0..self.trajectories.len());
            if let Some(trajectory) = self.trajectories.get(idx) {
                batch.push(trajectory.clone());
            }
        }

        batch
    }
}

impl AdvancedAdvancedPolicyGradientOptimizer {
    pub fn new(config: RLOptimizationConfig, state_size: usize, action_size: usize) -> Self {
        let hidden_sizes = vec![state_size * 2, state_size * 3, state_size * 2];
        let meta_policy = MetaPolicyNetwork::new(state_size, action_size, hidden_sizes);

        Self {
            config,
            meta_policy,
            reward_function: ImprovementReward::default(),
            meta_trajectories: VecDeque::with_capacity(1000),
            problem_class_history: VecDeque::with_capacity(100),
            best_params: Array1::zeros(state_size),
            best_objective: f64::INFINITY,
            meta_stats: MetaLearningStats {
                avg_learning_rates: Array1::zeros(state_size),
                meta_gradient_norms: VecDeque::with_capacity(1000),
                problem_class_performance: HashMap::new(),
                curriculum_progress: 0.0,
                adaptation_efficiency: 1.0,
            },
            curriculum_controller: CurriculumController::new(),
            meta_experience_buffer: MetaExperienceBuffer::new(500),
        }
    }

    /// Build `(base_features, meta_context)` from the current optimization state.
    fn extract_meta_state_features(
        &self,
        state: &OptimizationState,
        problem_class: &str,
    ) -> (Array1<f64>, Array1<f64>) {
        let mut base_features = Vec::new();

        // Bounded parameter features.
        for &param in state.parameters.iter() {
            base_features.push(param.tanh());
        }

        base_features.push((state.objective_value / (state.objective_value.abs() + 1.0)).tanh());
        base_features.push(
            state
                .convergence_metrics
                .relative_objective_change
                .ln()
                .max(-10.0)
                .tanh(),
        );
        base_features.push(state.convergence_metrics.parameter_change_norm.tanh());

        base_features.push((state.step as f64 / 100.0).tanh());

        let problem_difficulty = self.meta_policy.curriculum_difficulty;
        base_features.push(problem_difficulty);

        let mut meta_context = Vec::new();

        let class_performance = self
            .meta_stats
            .problem_class_performance
            .get(problem_class)
            .copied()
            .unwrap_or(0.0);
        meta_context.push(class_performance);

        let recent_meta_grad_norm = self
            .meta_stats
            .meta_gradient_norms
            .iter()
            .rev()
            .take(10)
            .sum::<f64>()
            / 10.0;
        meta_context.push(recent_meta_grad_norm.tanh());

        meta_context.push(self.meta_stats.curriculum_progress);

        meta_context.push(self.meta_stats.adaptation_efficiency);

        // Diversity of recently seen problem classes.
        let recent_classes: std::collections::HashSet<String> = self
            .problem_class_history
            .iter()
            .rev()
            .take(10)
            .cloned()
            .collect();
        meta_context.push((recent_classes.len() as f64 / 10.0).min(1.0));

        (Array1::from(base_features), Array1::from(meta_context))
    }

    /// Map network outputs to a concrete optimization action; the meta-output
    /// modulates both action selection and action strength.
    fn decode_meta_action(
        &self,
        policy_output: &ArrayView1<f64>,
        meta_output: &ArrayView1<f64>,
    ) -> OptimizationAction {
        if policy_output.is_empty() {
            return OptimizationAction::GradientStep {
                learning_rate: 0.01,
            };
        }

        let meta_modulation = meta_output.get(0).copied().unwrap_or(1.0);
        let action_strength = meta_output.get(1).copied().unwrap_or(1.0);

        // Greedy selection over meta-modulated logits.
        let action_logits = policy_output.mapv(|x| x * meta_modulation);
        let action_type = action_logits
            .iter()
            .enumerate()
            .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
            .map(|(idx, _)| idx)
            .unwrap_or(0);

        match action_type {
            0 => OptimizationAction::GradientStep {
                learning_rate: 0.01 * action_strength * (1.0 + policy_output[0] * 0.5),
            },
            1 => OptimizationAction::RandomPerturbation {
                magnitude: 0.1 * action_strength * (1.0 + policy_output[1] * 0.5),
            },
            2 => OptimizationAction::MomentumUpdate {
                momentum: (0.9 * action_strength * (1.0 + policy_output[2] * 0.1)).min(0.99),
            },
            3 => OptimizationAction::AdaptiveLearningRate {
                factor: (0.5 + 0.5 * policy_output[3] * action_strength).clamp(0.1, 2.0),
            },
            4 => OptimizationAction::ResetToBest,
            _ => OptimizationAction::Terminate,
        }
    }

    /// Accumulate advantage-weighted gradients over a batch of meta-trajectories
    /// (a cheap linear surrogate rather than exact backpropagation).
    fn compute_meta_gradients(&self, meta_batch: &[MetaTrajectory]) -> MetaGradients {
        let num_layers = self.meta_policy.layer_sizes.len() - 1;
        let max_size = *self
            .meta_policy
            .layer_sizes
            .iter()
            .max()
            .expect("layer_sizes is non-empty");

        let mut meta_gradients = MetaGradients {
            policy_gradients: Array3::zeros((num_layers, max_size, max_size)),
            meta_weight_gradients: Array3::zeros((num_layers, max_size, max_size)),
            meta_lr_gradients: Array3::zeros((num_layers, max_size, max_size)),
            policy_bias_gradients: Array2::zeros((num_layers, max_size)),
            meta_bias_gradients: Array2::zeros((num_layers, max_size)),
            second_order_terms: Array3::zeros((num_layers, max_size, max_size)),
        };

        for trajectory in meta_batch {
            let trajectory_return: f64 = trajectory.experiences.iter().map(|e| e.reward).sum();

            // Reward shaping: credit fast convergence and efficient exploration.
            let learning_speed_bonus = trajectory.learning_metrics.convergence_speed * 0.1;
            let exploration_bonus = trajectory.learning_metrics.exploration_efficiency * 0.05;
            let adjusted_return = trajectory_return + learning_speed_bonus + exploration_bonus;

            for (step, experience) in trajectory.experiences.iter().enumerate() {
                let (state_features, meta_context) =
                    self.extract_meta_state_features(&experience.state, &trajectory.problem_class);

                // Discounted return from this step onward.
                let gamma = self.config.discount_factor;
                let step_return: f64 = trajectory.experiences[step..]
                    .iter()
                    .enumerate()
                    .map(|(i, e)| gamma.powi(i as i32) * e.reward)
                    .sum();

                // Advantage relative to the trajectory's average adjusted return.
                let advantage = step_return - adjusted_return / trajectory.experiences.len() as f64;

                for layer in 0..num_layers {
                    for i in 0..self.meta_policy.layer_sizes[layer + 1] {
                        for j in 0..self.meta_policy.layer_sizes[layer] {
                            if j < state_features.len() {
                                meta_gradients.policy_gradients[[layer, i, j]] +=
                                    advantage * state_features[j] * 0.01;

                                let meta_lr_grad = advantage
                                    * state_features[j]
                                    * trajectory.learning_metrics.convergence_speed;
                                meta_gradients.meta_lr_gradients[[layer, i, j]] +=
                                    meta_lr_grad * 0.001;

                                if j < meta_context.len() {
                                    meta_gradients.meta_weight_gradients[[layer, i, j]] +=
                                        advantage * meta_context[j] * 0.001;
                                }
                            }
                        }

                        meta_gradients.policy_bias_gradients[[layer, i]] += advantage * 0.01;
                        meta_gradients.meta_bias_gradients[[layer, i]] +=
                            advantage * trajectory.learning_metrics.generalization_score * 0.001;
                    }
                }
            }
        }

        // Average over the batch.
        if !meta_batch.is_empty() {
            let batch_size = meta_batch.len() as f64;
            meta_gradients.policy_gradients /= batch_size;
            meta_gradients.meta_weight_gradients /= batch_size;
            meta_gradients.meta_lr_gradients /= batch_size;
            meta_gradients.policy_bias_gradients /= batch_size;
            meta_gradients.meta_bias_gradients /= batch_size;
        }

        meta_gradients
    }
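
    // Sketch of the estimator approximated above (a heuristic, not exact
    // backpropagation): the discounted return is G_t = sum_k gamma^k * r_{t+k},
    // the baseline is the trajectory's mean adjusted return, so the advantage is
    // A_t = G_t - R_adj / T, and a REINFORCE-style update would use
    // grad_theta J ~= E[ A_t * grad_theta log pi_theta(a_t | s_t) ].
    // Here the log-probability gradient is replaced by the raw state features
    // scaled by small constants, which keeps the update cheap at the cost of
    // being an approximation.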

    fn update_meta_stats(
        &mut self,
        meta_gradients: &MetaGradients,
        problem_class: &str,
        performance: f64,
    ) {
        let grad_norm = meta_gradients
            .policy_gradients
            .iter()
            .map(|&g| g * g)
            .sum::<f64>()
            .sqrt();
        self.meta_stats.meta_gradient_norms.push_back(grad_norm);
        if self.meta_stats.meta_gradient_norms.len() > 1000 {
            self.meta_stats.meta_gradient_norms.pop_front();
        }

        // Exponential moving average of per-class performance.
        let current_perf = self
            .meta_stats
            .problem_class_performance
            .entry(problem_class.to_string())
            .or_insert(0.0);
        *current_perf = 0.9 * *current_perf + 0.1 * performance;

        self.meta_stats.curriculum_progress = self.curriculum_controller.difficulty_level;

        // Adaptation efficiency tracks how stable recent meta-gradient norms are.
        let grad_stability = if self.meta_stats.meta_gradient_norms.len() > 10 {
            let recent_grads: Vec<f64> = self
                .meta_stats
                .meta_gradient_norms
                .iter()
                .rev()
                .take(10)
                .cloned()
                .collect();
            let mean = recent_grads.iter().sum::<f64>() / recent_grads.len() as f64;
            let variance = recent_grads
                .iter()
                .map(|&x| (x - mean).powi(2))
                .sum::<f64>()
                / recent_grads.len() as f64;
            1.0 / (1.0 + variance)
        } else {
            1.0
        };

        self.meta_stats.adaptation_efficiency =
            0.95 * self.meta_stats.adaptation_efficiency + 0.05 * grad_stability;
    }

    /// Heuristically classify the objective using a cheap curvature probe.
    fn classify_problem<F>(&self, objective: &F, params: &ArrayView1<f64>) -> String
    where
        F: Fn(&ArrayView1<f64>) -> f64,
    {
        let base_value = objective(params);

        // Central second differences along the first few coordinates.
        let eps = 1e-6;
        let mut curvature_sum = 0.0;

        for i in 0..params.len().min(3) {
            let mut params_plus = params.to_owned();
            let mut params_minus = params.to_owned();
            params_plus[i] += eps;
            params_minus[i] -= eps;

            let f_plus = objective(&params_plus.view());
            let f_minus = objective(&params_minus.view());
            let curvature = (f_plus + f_minus - 2.0 * base_value) / (eps * eps);
            curvature_sum += curvature;
        }

        let avg_curvature = curvature_sum / params.len().min(3) as f64;

        if avg_curvature > 1.0 {
            "convex".to_string()
        } else if avg_curvature < -1.0 {
            "concave".to_string()
        } else if base_value.abs() < 1.0 {
            "low_scale".to_string()
        } else if base_value.abs() > 100.0 {
            "high_scale".to_string()
        } else {
            "general".to_string()
        }
    }
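
    // The probe above is the central second-difference estimate
    //     f''(x_i) ~= (f(x + eps * e_i) + f(x - eps * e_i) - 2 f(x)) / eps^2,
    // averaged over up to three coordinates. Illustrative check (not a test in
    // this file): for f(x) = x_0^2 + x_1^2 each coordinate yields a curvature of
    // about 2, so the average exceeds 1 and the problem is labeled "convex".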
}

impl RLOptimizer for AdvancedAdvancedPolicyGradientOptimizer {
    fn config(&self) -> &RLOptimizationConfig {
        &self.config
    }

    fn select_action(&mut self, state: &OptimizationState) -> OptimizationAction {
        // No objective handle is available here, so fall back to the generic class.
        let problem_class = "general";
        let (state_features, meta_context) = self.extract_meta_state_features(state, problem_class);
        let (policy_output, meta_output) =
            self.meta_policy
                .meta_forward(&state_features.view(), problem_class, &meta_context);
        self.decode_meta_action(&policy_output.view(), &meta_output.view())
    }

    fn update(&mut self, _experience: &Experience) -> Result<(), OptimizeError> {
        // Per-step updates are a no-op; learning happens at the meta level in `train`.
        Ok(())
    }

    fn run_episode<F>(
        &mut self,
        objective: &F,
        initial_params: &ArrayView1<f64>,
    ) -> OptimizeResult<OptimizeResults<f64>>
    where
        F: Fn(&ArrayView1<f64>) -> f64,
    {
        let problem_class = self.classify_problem(objective, initial_params);
        self.problem_class_history.push_back(problem_class.clone());
        if self.problem_class_history.len() > 100 {
            self.problem_class_history.pop_front();
        }

        let initial_meta_context = Array1::from(vec![
            self.meta_stats.curriculum_progress,
            self.meta_stats.adaptation_efficiency,
            self.curriculum_controller.difficulty_level,
        ]);

        let mut current_params = initial_params.to_owned();
        let mut current_state = utils::create_state(current_params.clone(), objective, 0, None);
        let mut experiences = Vec::new();
        let mut momentum = Array1::zeros(initial_params.len());

        let start_objective = current_state.objective_value;
        let mut max_improvement = 0.0;
        let mut exploration_steps = 0;

        for step in 0..self.config.max_steps_per_episode {
            let action = self.select_action(&current_state);

            let new_params =
                utils::apply_action(&current_state, &action, &self.best_params, &mut momentum);
            let new_state =
                utils::create_state(new_params, objective, step + 1, Some(&current_state));

            // Reward with a small bonus for exploratory actions.
            let base_reward =
                self.reward_function
                    .compute_reward(&current_state, &action, &new_state);
            let exploration_bonus =
                if matches!(action, OptimizationAction::RandomPerturbation { .. }) {
                    exploration_steps += 1;
                    0.01
                } else {
                    0.0
                };
            let reward = base_reward + exploration_bonus;

            let improvement = current_state.objective_value - new_state.objective_value;
            if improvement > max_improvement {
                max_improvement = improvement;
            }

            let experience = Experience {
                state: current_state.clone(),
                action: action.clone(),
                reward,
                next_state: new_state.clone(),
                done: utils::should_terminate(&new_state, self.config.max_steps_per_episode),
            };
            experiences.push(experience);

            if new_state.objective_value < self.best_objective {
                self.best_objective = new_state.objective_value;
                self.best_params = new_state.parameters.clone();
            }

            current_state = new_state;
            current_params = current_state.parameters.clone();

            if utils::should_terminate(&current_state, self.config.max_steps_per_episode)
                || matches!(action, OptimizationAction::Terminate)
            {
                break;
            }
        }

        // Episode-level learning metrics used by the meta-learner.
        let final_objective = current_state.objective_value;
        let total_improvement = start_objective - final_objective;
        let learning_metrics = LearningMetrics {
            improvement_rate: total_improvement / (current_state.step as f64 + 1.0),
            convergence_speed: if total_improvement > 0.0 {
                max_improvement / total_improvement
            } else {
                0.0
            },
            exploration_efficiency: (exploration_steps as f64) / (current_state.step as f64 + 1.0),
            generalization_score: if total_improvement > 0.0 {
                (total_improvement / start_objective.abs()).min(1.0)
            } else {
                0.0
            },
        };

        let meta_trajectory = MetaTrajectory {
            experiences,
            problem_class: problem_class.clone(),
            initial_meta_context,
            learning_metrics: learning_metrics.clone(),
            adaptation_speed: learning_metrics.improvement_rate.abs(),
        };

        self.meta_experience_buffer.add_trajectory(meta_trajectory);

        let episode_performance = learning_metrics.generalization_score;
        self.curriculum_controller
            .update_progress(episode_performance);

        Ok(OptimizeResults::<f64> {
            x: current_params,
            fun: current_state.objective_value,
            success: current_state.convergence_metrics.relative_objective_change < 1e-6,
            nit: current_state.step,
            nfev: current_state.step,
            njev: 0,
            nhev: 0,
            maxcv: 0,
            status: 0,
            message: format!(
                "Meta-policy gradient episode completed for problem class: {}",
                problem_class
            ),
            jac: None,
            hess: None,
            constr: None,
        })
    }

    fn train<F>(
        &mut self,
        objective: &F,
        initial_params: &ArrayView1<f64>,
    ) -> OptimizeResult<OptimizeResults<f64>>
    where
        F: Fn(&ArrayView1<f64>) -> f64,
    {
        let mut best_result = OptimizeResults::<f64> {
            x: initial_params.to_owned(),
            fun: f64::INFINITY,
            success: false,
            nit: 0,
            nfev: 0,
            njev: 0,
            nhev: 0,
            maxcv: 0,
            status: 0,
            message: "Meta-learning training not completed".to_string(),
            jac: None,
            hess: None,
            constr: None,
        };

        for episode in 0..self.config.num_episodes {
            let result = self.run_episode(objective, initial_params)?;

            if result.fun < best_result.fun {
                best_result = result;
            }

            // Every 5 episodes, perform a meta-update from a sampled trajectory batch.
            if (episode + 1) % 5 == 0 && self.meta_experience_buffer.trajectories.len() >= 10 {
                let meta_batch = self.meta_experience_buffer.sample_meta_batch(10);
                let meta_gradients = self.compute_meta_gradients(&meta_batch);

                self.meta_policy.meta_update(
                    &meta_gradients,
                    self.config.learning_rate,
                    self.config.learning_rate * 0.1,
                );

                let avg_performance = meta_batch
                    .iter()
                    .map(|t| t.learning_metrics.generalization_score)
                    .sum::<f64>()
                    / meta_batch.len() as f64;

                if let Some(trajectory) = meta_batch.first() {
                    self.update_meta_stats(
                        &meta_gradients,
                        &trajectory.problem_class,
                        avg_performance,
                    );
                }
            }
        }

        best_result.x = self.best_params.clone();
        best_result.fun = self.best_objective;
        best_result.message = format!(
            "Meta-learning training completed. Curriculum level: {:.3}, Adaptation efficiency: {:.3}",
            self.meta_stats.curriculum_progress, self.meta_stats.adaptation_efficiency
        );

        Ok(best_result)
    }

    fn reset(&mut self) {
        self.meta_trajectories.clear();
        self.problem_class_history.clear();
        self.best_objective = f64::INFINITY;
        self.best_params.fill(0.0);
        self.meta_stats.meta_gradient_norms.clear();
        self.meta_stats.problem_class_performance.clear();
        self.curriculum_controller = CurriculumController::new();
        self.meta_experience_buffer = MetaExperienceBuffer::new(500);
    }
}

/// Meta-learning policy-gradient optimization entry point.
#[allow(dead_code)]
pub fn advanced_advanced_policy_gradient_optimize<F>(
    objective: F,
    initial_params: &ArrayView1<f64>,
    config: Option<RLOptimizationConfig>,
) -> OptimizeResult<OptimizeResults<f64>>
where
    F: Fn(&ArrayView1<f64>) -> f64,
{
    let config = config.unwrap_or_else(|| RLOptimizationConfig {
        num_episodes: 100,
        max_steps_per_episode: 50,
        learning_rate: 0.001,
        ..Default::default()
    });

    let mut optimizer = AdvancedAdvancedPolicyGradientOptimizer::new(
        config,
        initial_params.len() + 5, // parameters plus the five extra state features
        6,                        // number of action types decoded by the policy
    );
    optimizer.train(&objective, initial_params)
}

/// Convenience alias for the meta-learning policy-gradient optimizer.
#[allow(dead_code)]
pub fn policy_gradient_optimize<F>(
    objective: F,
    initial_params: &ArrayView1<f64>,
    config: Option<RLOptimizationConfig>,
) -> OptimizeResult<OptimizeResults<f64>>
where
    F: Fn(&ArrayView1<f64>) -> f64,
{
    advanced_advanced_policy_gradient_optimize(objective, initial_params, config)
}
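
// Usage sketch (illustrative, mirroring the tests below):
//
//     let objective = |x: &ArrayView1<f64>| (x[0] - 1.0).powi(2) + x[1].powi(2);
//     let x0 = Array1::from(vec![0.0, 0.0]);
//     let result = policy_gradient_optimize(objective, &x0.view(), None)
//         .expect("optimization should not error");
//     println!("best objective found: {}", result.fun);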

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_meta_policy_network_creation() {
        let network = MetaPolicyNetwork::new(4, 2, vec![8, 6]);
        assert_eq!(network.layer_sizes, vec![4, 8, 6, 2]);
    }

    #[test]
    fn test_meta_forward_pass() {
        let mut network = MetaPolicyNetwork::new(3, 2, vec![4]);
        let input = Array1::from(vec![0.5, -0.3, 0.8]);
        let meta_context = Array1::from(vec![0.1, 0.2]);

        let (policy_out, meta_out) = network.meta_forward(&input.view(), "test", &meta_context);

        assert_eq!(policy_out.len(), 2);
        assert_eq!(meta_out.len(), 2);
    }

    #[test]
    fn test_curriculum_controller() {
        let mut controller = CurriculumController::new();
        assert_eq!(controller.difficulty_level, 0.1);

        // Consistently high performance should advance the curriculum.
        for _ in 0..25 {
            controller.update_progress(0.9);
        }

        assert!(controller.difficulty_level > 0.1);
    }

    #[test]
    fn test_meta_experience_buffer() {
        let mut buffer = MetaExperienceBuffer::new(10);

        let trajectory = MetaTrajectory {
            experiences: vec![],
            problem_class: "test".to_string(),
            initial_meta_context: Array1::zeros(3),
            learning_metrics: LearningMetrics {
                improvement_rate: 0.1,
                convergence_speed: 0.2,
                exploration_efficiency: 0.3,
                generalization_score: 0.4,
            },
            adaptation_speed: 0.1,
        };

        buffer.add_trajectory(trajectory);
        assert_eq!(buffer.trajectories.len(), 1);

        let batch = buffer.sample_meta_batch(1);
        assert_eq!(batch.len(), 1);
    }

    #[test]
    fn test_advanced_advanced_optimizer_creation() {
        let config = RLOptimizationConfig::default();
        let optimizer = AdvancedAdvancedPolicyGradientOptimizer::new(config, 4, 3);

        assert_eq!(optimizer.meta_policy.layer_sizes[0], 4);
        assert_eq!(optimizer.meta_policy.layer_sizes.last(), Some(&3));
    }

    #[test]
    fn test_problem_classification() {
        let config = RLOptimizationConfig::default();
        let optimizer = AdvancedAdvancedPolicyGradientOptimizer::new(config, 2, 3);

        let quadratic = |x: &ArrayView1<f64>| x[0].powi(2) + x[1].powi(2);
        let params = Array1::from(vec![1.0, 1.0]);

        let class = optimizer.classify_problem(&quadratic, &params.view());
        assert!(!class.is_empty());
    }

    #[test]
    fn test_meta_optimization() {
        let config = RLOptimizationConfig {
            num_episodes: 50,
            max_steps_per_episode: 50,
            learning_rate: 0.05,
            ..Default::default()
        };

        let objective = |x: &ArrayView1<f64>| (x[0] - 1.0).powi(2) + (x[1] + 0.5).powi(2);
        let initial = Array1::from(vec![0.0, 0.0]);

        let result =
            advanced_advanced_policy_gradient_optimize(objective, &initial.view(), Some(config))
                .expect("meta-optimization should not error");

        assert!(result.nit > 0);
        assert!(result.fun <= objective(&initial.view()) * 1.01);
    }
}

#[allow(dead_code)]
pub fn placeholder() {}