use super::{
    utils, Experience, ImprovementReward, OptimizationAction, OptimizationState,
    RLOptimizationConfig, RLOptimizer, RewardFunction,
};
use crate::error::{OptimizeError, OptimizeResult};
use crate::result::OptimizeResults;
use scirs2_core::ndarray::{Array1, Array2, Array3, ArrayView1};
use scirs2_core::random::{rng, Rng};
use std::collections::{HashMap, VecDeque};

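/// Policy network with meta-learning components: per-layer adaptive learning
/// rates, a separate meta-weight pathway, curriculum difficulty tracking, and
/// cached per-problem-class embeddings.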
#[derive(Debug, Clone)]
pub struct MetaPolicyNetwork {
    /// Policy weights, indexed as [layer, output, input].
    pub policy_weights: Array3<f64>,
    /// Meta-network weights, indexed as [layer, output, input].
    pub meta_weights: Array3<f64>,
    /// Policy biases, indexed as [layer, output].
    pub policy_bias: Array2<f64>,
    /// Meta-network biases, indexed as [layer, output].
    pub meta_bias: Array2<f64>,
    /// Layer sizes, including the input and output layers.
    pub layer_sizes: Vec<usize>,
    /// Per-unit adaptive learning-rate multipliers.
    pub adaptive_learning_rates: Array2<f64>,
    /// Accumulator for meta-gradients.
    pub meta_gradient_accumulator: Array3<f64>,
    /// Second-order (curvature) information.
    pub second_order_info: Array3<f64>,
    /// Current curriculum difficulty in [0, 1].
    pub curriculum_difficulty: f64,
    /// Learned embeddings keyed by problem class.
    pub problem_embeddings: HashMap<String, Array1<f64>>,
}

impl MetaPolicyNetwork {
    pub fn new(input_size: usize, output_size: usize, hidden_sizes: Vec<usize>) -> Self {
        let mut layer_sizes = vec![input_size];
        layer_sizes.extend(hidden_sizes);
        layer_sizes.push(output_size);

        let num_layers = layer_sizes.len() - 1;
        let max_layer_size = *layer_sizes.iter().max().unwrap();

        let mut policy_weights = Array3::zeros((num_layers, max_layer_size, max_layer_size));
        let mut meta_weights = Array3::zeros((num_layers, max_layer_size, max_layer_size));

        // Xavier-style initialization scaled by the fan-in/fan-out of each layer.
        for layer in 0..num_layers {
            let fan_in = layer_sizes[layer];
            let fan_out = layer_sizes[layer + 1];
            let xavier_std = (2.0 / (fan_in + fan_out) as f64).sqrt();

            for i in 0..fan_out {
                for j in 0..fan_in {
                    policy_weights[[layer, i, j]] =
                        scirs2_core::random::rng().random_range(-0.5..0.5) * 2.0 * xavier_std;
                    meta_weights[[layer, i, j]] =
                        scirs2_core::random::rng().random_range(-0.5..0.5) * 2.0 * xavier_std * 0.1;
                }
            }
        }

        Self {
            policy_weights,
            meta_weights,
            policy_bias: Array2::zeros((num_layers, max_layer_size)),
            meta_bias: Array2::zeros((num_layers, max_layer_size)),
            layer_sizes,
            adaptive_learning_rates: Array2::from_elem((num_layers, max_layer_size), 0.01),
            meta_gradient_accumulator: Array3::zeros((num_layers, max_layer_size, max_layer_size)),
            second_order_info: Array3::zeros((num_layers, max_layer_size, max_layer_size)),
            curriculum_difficulty: 0.1,
            problem_embeddings: HashMap::new(),
        }
    }

    /// Forward pass that augments the state features with a problem-class
    /// embedding and returns both the policy output and the meta output.
    pub fn meta_forward(
        &mut self,
        state_features: &ArrayView1<f64>,
        problem_class: &str,
        meta_context: &Array1<f64>,
    ) -> (Array1<f64>, Array1<f64>) {
        let problem_embedding =
            self.get_or_create_problem_embedding(problem_class, state_features.len());

        let mut augmented_input = state_features.to_owned();

        for (i, &emb) in problem_embedding.iter().enumerate() {
            if i < augmented_input.len() {
                augmented_input[i] += emb * 0.1;
            }
        }

        let policy_output = self.forward_policy(&augmented_input.view());

        let meta_output = self.forward_meta(&augmented_input.view(), meta_context);

        (policy_output, meta_output)
    }

    fn forward_policy(&self, input: &ArrayView1<f64>) -> Array1<f64> {
        let mut current_input = input.to_owned();

        for layer in 0..(self.layer_sizes.len() - 1) {
            let layer_input_size = self.layer_sizes[layer];
            let layer_output_size = self.layer_sizes[layer + 1];

            let mut layer_output = Array1::<f64>::zeros(layer_output_size);

            for i in 0..layer_output_size {
                for j in 0..layer_input_size.min(current_input.len()) {
                    layer_output[i] += self.policy_weights[[layer, i, j]] * current_input[j];
                }
                layer_output[i] += self.policy_bias[[layer, i]];

                // ELU activation.
                layer_output[i] = if layer_output[i] > 0.0 {
                    layer_output[i]
                } else {
                    layer_output[i].exp() - 1.0
                };
            }

            current_input = layer_output;
        }

        current_input
    }

    fn forward_meta(&self, input: &ArrayView1<f64>, meta_context: &Array1<f64>) -> Array1<f64> {
        let mut meta_input = input.to_owned();
        for (i, &ctx) in meta_context.iter().enumerate() {
            if i < meta_input.len() {
                meta_input[i] += ctx * 0.05;
            }
        }

        let mut current_input = meta_input;

        for layer in 0..(self.layer_sizes.len() - 1) {
            let layer_input_size = self.layer_sizes[layer];
            let layer_output_size = self.layer_sizes[layer + 1];

            let mut layer_output = Array1::<f64>::zeros(layer_output_size);

            for i in 0..layer_output_size {
                for j in 0..layer_input_size.min(current_input.len()) {
                    layer_output[i] += self.meta_weights[[layer, i, j]] * current_input[j];
                }
                layer_output[i] += self.meta_bias[[layer, i]];

                // Sigmoid activation keeps meta outputs in (0, 1).
                layer_output[i] = 1.0 / (1.0 + (-layer_output[i]).exp());
            }

            current_input = layer_output;
        }

        current_input
    }

    fn get_or_create_problem_embedding(
        &mut self,
        problem_class: &str,
        input_size: usize,
    ) -> Array1<f64> {
        if let Some(embedding) = self.problem_embeddings.get(problem_class) {
            embedding.clone()
        } else {
            let embedding = Array1::from_shape_fn(input_size, |_| {
                scirs2_core::random::rng().random_range(-0.05..0.05)
            });
            self.problem_embeddings
                .insert(problem_class.to_string(), embedding.clone());
            embedding
        }
    }

    /// Apply meta-gradients: adapt the per-unit learning rates first, then
    /// update the policy and meta weights with their respective learning rates.
    pub fn meta_update(
        &mut self,
        meta_gradients: &MetaGradients,
        base_learning_rate: f64,
        meta_learning_rate: f64,
    ) {
        for layer in 0..(self.layer_sizes.len() - 1) {
            for i in 0..self.layer_sizes[layer + 1] {
                for j in 0..self.layer_sizes[layer] {
                    let meta_grad = meta_gradients.meta_lr_gradients[[layer, i, j]];
                    self.adaptive_learning_rates[[layer, i]] *=
                        (1.0 + meta_learning_rate * meta_grad).clamp(0.1, 10.0);

                    let adaptive_lr = self.adaptive_learning_rates[[layer, i]] * base_learning_rate;
                    self.policy_weights[[layer, i, j]] +=
                        adaptive_lr * meta_gradients.policy_gradients[[layer, i, j]];

                    self.meta_weights[[layer, i, j]] +=
                        meta_learning_rate * meta_gradients.meta_weight_gradients[[layer, i, j]];
                }

                let adaptive_lr = self.adaptive_learning_rates[[layer, i]] * base_learning_rate;
                self.policy_bias[[layer, i]] +=
                    adaptive_lr * meta_gradients.policy_bias_gradients[[layer, i]];
                self.meta_bias[[layer, i]] +=
                    meta_learning_rate * meta_gradients.meta_bias_gradients[[layer, i]];
            }
        }

        self.update_curriculum_difficulty(meta_gradients);
    }

    fn update_curriculum_difficulty(&mut self, meta_gradients: &MetaGradients) {
        let gradient_norm = meta_gradients
            .policy_gradients
            .iter()
            .map(|&g| g * g)
            .sum::<f64>()
            .sqrt();

        // Small gradients suggest the current difficulty is mastered; large
        // gradients suggest the tasks are still too hard.
        if gradient_norm < 0.1 {
            self.curriculum_difficulty = (self.curriculum_difficulty * 1.05).min(1.0);
        } else if gradient_norm > 1.0 {
            self.curriculum_difficulty = (self.curriculum_difficulty * 0.95).max(0.01);
        }
    }
}

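/// Gradient bundle produced by `compute_meta_gradients` and consumed by
/// `MetaPolicyNetwork::meta_update`.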
#[derive(Debug, Clone)]
pub struct MetaGradients {
    /// Gradients for the policy weights.
    pub policy_gradients: Array3<f64>,
    /// Gradients for the meta-network weights.
    pub meta_weight_gradients: Array3<f64>,
    /// Gradients used to adapt the per-unit learning rates.
    pub meta_lr_gradients: Array3<f64>,
    /// Gradients for the policy biases.
    pub policy_bias_gradients: Array2<f64>,
    /// Gradients for the meta-network biases.
    pub meta_bias_gradients: Array2<f64>,
    /// Second-order correction terms.
    pub second_order_terms: Array3<f64>,
}

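/// Policy-gradient optimizer with meta-learning: it classifies problems,
/// maintains a curriculum of task difficulty, and periodically updates the
/// meta-policy from batches of stored trajectories.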
#[derive(Debug, Clone)]
pub struct AdvancedAdvancedPolicyGradientOptimizer {
    /// RL optimization configuration.
    config: RLOptimizationConfig,
    /// Meta-policy network used for action selection.
    meta_policy: MetaPolicyNetwork,
    /// Reward function based on objective improvement.
    reward_function: ImprovementReward,
    /// Recently completed meta-trajectories.
    meta_trajectories: VecDeque<MetaTrajectory>,
    /// Recently observed problem classes.
    problem_class_history: VecDeque<String>,
    /// Best parameters found so far.
    best_params: Array1<f64>,
    /// Best objective value found so far.
    best_objective: f64,
    /// Running meta-learning statistics.
    meta_stats: MetaLearningStats,
    /// Curriculum difficulty controller.
    curriculum_controller: CurriculumController,
    /// Buffer of meta-trajectories used for meta-updates.
    meta_experience_buffer: MetaExperienceBuffer,
}

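/// A single episode's experiences together with the problem class, the
/// meta-context at the start of the episode, and summary learning metrics.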
#[derive(Debug, Clone)]
pub struct MetaTrajectory {
    /// Experiences collected during the episode.
    pub experiences: Vec<Experience>,
    /// Problem class the episode was run on.
    pub problem_class: String,
    /// Meta-context at the start of the episode.
    pub initial_meta_context: Array1<f64>,
    /// Summary learning metrics for the episode.
    pub learning_metrics: LearningMetrics,
    /// How quickly the policy adapted during the episode.
    pub adaptation_speed: f64,
}

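/// Per-episode learning quality metrics used to shape the meta-gradients.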
#[derive(Debug, Clone)]
pub struct LearningMetrics {
    /// Average objective improvement per step.
    pub improvement_rate: f64,
    /// Fraction of the total improvement achieved in the best single step.
    pub convergence_speed: f64,
    /// Fraction of steps spent on exploratory actions.
    pub exploration_efficiency: f64,
    /// Relative improvement over the episode, capped at 1.0.
    pub generalization_score: f64,
}

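/// Running statistics tracked across episodes and meta-updates.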
#[derive(Debug, Clone)]
pub struct MetaLearningStats {
    /// Average adaptive learning rates per parameter.
    pub avg_learning_rates: Array1<f64>,
    /// Recent meta-gradient norms (bounded history).
    pub meta_gradient_norms: VecDeque<f64>,
    /// Exponentially smoothed performance per problem class.
    pub problem_class_performance: HashMap<String, f64>,
    /// Current curriculum progress (mirrors the controller's difficulty level).
    pub curriculum_progress: f64,
    /// Stability-based estimate of adaptation efficiency.
    pub adaptation_efficiency: f64,
}

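/// Tracks recent episode performance and raises the task difficulty once the
/// rolling average crosses the threshold for the current level.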
#[derive(Debug, Clone)]
pub struct CurriculumController {
    /// Current difficulty level in [0, 1].
    pub difficulty_level: f64,
    /// Performance thresholds required to advance, indexed by level.
    pub advancement_thresholds: Vec<f64>,
    /// Difficulty scaling per problem class.
    pub difficulty_generators: HashMap<String, f64>,
    /// Recent episode performances (capped at 100 entries).
    pub progress_tracker: VecDeque<f64>,
}

impl Default for CurriculumController {
    fn default() -> Self {
        Self::new()
    }
}

impl CurriculumController {
    pub fn new() -> Self {
        Self {
            difficulty_level: 0.1,
            advancement_thresholds: vec![0.8, 0.85, 0.9, 0.95],
            difficulty_generators: HashMap::new(),
            progress_tracker: VecDeque::with_capacity(100),
        }
    }

    pub fn should_advance(&self) -> bool {
        if self.progress_tracker.len() < 20 {
            return false;
        }

        let recent_performance: f64 =
            self.progress_tracker.iter().rev().take(20).sum::<f64>() / 20.0;

        let threshold_idx = ((self.difficulty_level * 4.0) as usize).min(3);
        recent_performance > self.advancement_thresholds[threshold_idx]
    }

    pub fn advance_difficulty(&mut self) {
        self.difficulty_level = (self.difficulty_level * 1.2).min(1.0);
    }

    pub fn update_progress(&mut self, performance: f64) {
        self.progress_tracker.push_back(performance);
        if self.progress_tracker.len() > 100 {
            self.progress_tracker.pop_front();
        }

        if self.should_advance() {
            self.advance_difficulty();
        }
    }
}

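/// Bounded buffer of meta-trajectories with per-class weights that drift up
/// for classes yielding positive average reward and down otherwise.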
#[derive(Debug, Clone)]
pub struct MetaExperienceBuffer {
    /// Stored trajectories (oldest evicted first).
    pub trajectories: VecDeque<MetaTrajectory>,
    /// Maximum number of trajectories to keep.
    pub max_size: usize,
    /// Sampling weights per problem class.
    pub class_weights: HashMap<String, f64>,
}

impl MetaExperienceBuffer {
    pub fn new(max_size: usize) -> Self {
        Self {
            trajectories: VecDeque::with_capacity(max_size),
            max_size,
            class_weights: HashMap::new(),
        }
    }

    pub fn add_trajectory(&mut self, trajectory: MetaTrajectory) {
        let avg_reward = trajectory.experiences.iter().map(|e| e.reward).sum::<f64>()
            / trajectory.experiences.len().max(1) as f64;

        *self
            .class_weights
            .entry(trajectory.problem_class.clone())
            .or_insert(1.0) *= if avg_reward > 0.0 { 1.05 } else { 0.95 };

        self.trajectories.push_back(trajectory);
        if self.trajectories.len() > self.max_size {
            self.trajectories.pop_front();
        }
    }

    pub fn sample_meta_batch(&self, batch_size: usize) -> Vec<MetaTrajectory> {
        let mut batch = Vec::new();

        for _ in 0..batch_size.min(self.trajectories.len()) {
            let idx = scirs2_core::random::rng().random_range(0..self.trajectories.len());
            if let Some(trajectory) = self.trajectories.get(idx) {
                batch.push(trajectory.clone());
            }
        }

        batch
    }
}

impl AdvancedAdvancedPolicyGradientOptimizer {
    pub fn new(config: RLOptimizationConfig, state_size: usize, action_size: usize) -> Self {
        let hidden_sizes = vec![state_size * 2, state_size * 3, state_size * 2];
        let meta_policy = MetaPolicyNetwork::new(state_size, action_size, hidden_sizes);

        Self {
            config,
            meta_policy,
            reward_function: ImprovementReward::default(),
            meta_trajectories: VecDeque::with_capacity(1000),
            problem_class_history: VecDeque::with_capacity(100),
            best_params: Array1::zeros(state_size),
            best_objective: f64::INFINITY,
            meta_stats: MetaLearningStats {
                avg_learning_rates: Array1::zeros(state_size),
                meta_gradient_norms: VecDeque::with_capacity(1000),
                problem_class_performance: HashMap::new(),
                curriculum_progress: 0.0,
                adaptation_efficiency: 1.0,
            },
            curriculum_controller: CurriculumController::new(),
            meta_experience_buffer: MetaExperienceBuffer::new(500),
        }
    }

    /// Build the base state features and the meta-context vector for a state.
    fn extract_meta_state_features(
        &self,
        state: &OptimizationState,
        problem_class: &str,
    ) -> (Array1<f64>, Array1<f64>) {
        let mut base_features = Vec::new();

        // Squash raw parameters into a bounded range.
        for &param in state.parameters.iter() {
            base_features.push(param.tanh());
        }

        base_features.push((state.objective_value / (state.objective_value.abs() + 1.0)).tanh());
        base_features.push(
            state
                .convergence_metrics
                .relative_objective_change
                .ln()
                .max(-10.0)
                .tanh(),
        );
        base_features.push(state.convergence_metrics.parameter_change_norm.tanh());

        base_features.push((state.step as f64 / 100.0).tanh());

        let problem_difficulty = self.meta_policy.curriculum_difficulty;
        base_features.push(problem_difficulty);

        let mut meta_context = Vec::new();

        let class_performance = self
            .meta_stats
            .problem_class_performance
            .get(problem_class)
            .copied()
            .unwrap_or(0.0);
        meta_context.push(class_performance);

        let recent_meta_grad_norm = self
            .meta_stats
            .meta_gradient_norms
            .iter()
            .rev()
            .take(10)
            .sum::<f64>()
            / 10.0;
        meta_context.push(recent_meta_grad_norm.tanh());

        meta_context.push(self.meta_stats.curriculum_progress);

        meta_context.push(self.meta_stats.adaptation_efficiency);

        // Diversity of recently seen problem classes.
        let recent_classes: std::collections::HashSet<String> = self
            .problem_class_history
            .iter()
            .rev()
            .take(10)
            .cloned()
            .collect();
        meta_context.push((recent_classes.len() as f64 / 10.0).min(1.0));

        (Array1::from(base_features), Array1::from(meta_context))
    }

    /// Decode the policy and meta outputs into a concrete optimization action.
    fn decode_meta_action(
        &self,
        policy_output: &ArrayView1<f64>,
        meta_output: &ArrayView1<f64>,
    ) -> OptimizationAction {
        if policy_output.is_empty() {
            return OptimizationAction::GradientStep {
                learning_rate: 0.01,
            };
        }

        let meta_modulation = meta_output.get(0).copied().unwrap_or(1.0);
        let action_strength = meta_output.get(1).copied().unwrap_or(1.0);

        let action_logits = policy_output.mapv(|x| x * meta_modulation);
        let action_type = action_logits
            .iter()
            .enumerate()
            .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
            .map(|(idx, _)| idx)
            .unwrap_or(0);

        match action_type {
            0 => OptimizationAction::GradientStep {
                learning_rate: 0.01 * action_strength * (1.0 + policy_output[0] * 0.5),
            },
            1 => OptimizationAction::RandomPerturbation {
                magnitude: 0.1 * action_strength * (1.0 + policy_output[1] * 0.5),
            },
            2 => OptimizationAction::MomentumUpdate {
                momentum: (0.9 * action_strength * (1.0 + policy_output[2] * 0.1)).min(0.99),
            },
            3 => OptimizationAction::AdaptiveLearningRate {
                factor: (0.5 + 0.5 * policy_output[3] * action_strength).clamp(0.1, 2.0),
            },
            4 => OptimizationAction::ResetToBest,
            _ => OptimizationAction::Terminate,
        }
    }

    /// Estimate policy, meta-weight, and learning-rate gradients from a batch
    /// of stored trajectories using advantage-weighted feature products.
    fn compute_meta_gradients(&self, meta_batch: &[MetaTrajectory]) -> MetaGradients {
        let num_layers = self.meta_policy.layer_sizes.len() - 1;
        let max_size = *self.meta_policy.layer_sizes.iter().max().unwrap();

        let mut meta_gradients = MetaGradients {
            policy_gradients: Array3::zeros((num_layers, max_size, max_size)),
            meta_weight_gradients: Array3::zeros((num_layers, max_size, max_size)),
            meta_lr_gradients: Array3::zeros((num_layers, max_size, max_size)),
            policy_bias_gradients: Array2::zeros((num_layers, max_size)),
            meta_bias_gradients: Array2::zeros((num_layers, max_size)),
            second_order_terms: Array3::zeros((num_layers, max_size, max_size)),
        };

        for trajectory in meta_batch {
            let trajectory_return: f64 = trajectory.experiences.iter().map(|e| e.reward).sum();

            let learning_speed_bonus = trajectory.learning_metrics.convergence_speed * 0.1;
            let exploration_bonus = trajectory.learning_metrics.exploration_efficiency * 0.05;
            let adjusted_return = trajectory_return + learning_speed_bonus + exploration_bonus;

            for (step, experience) in trajectory.experiences.iter().enumerate() {
                let (state_features, meta_context) =
                    self.extract_meta_state_features(&experience.state, &trajectory.problem_class);

                // Discounted return from this step onward.
                let gamma = self.config.discount_factor;
                let step_return: f64 = trajectory.experiences[step..]
                    .iter()
                    .enumerate()
                    .map(|(i, e)| gamma.powi(i as i32) * e.reward)
                    .sum();

                let advantage = step_return - adjusted_return / trajectory.experiences.len() as f64;

                for layer in 0..num_layers {
                    for i in 0..self.meta_policy.layer_sizes[layer + 1] {
                        for j in 0..self.meta_policy.layer_sizes[layer] {
                            if j < state_features.len() {
                                meta_gradients.policy_gradients[[layer, i, j]] +=
                                    advantage * state_features[j] * 0.01;

                                let meta_lr_grad = advantage
                                    * state_features[j]
                                    * trajectory.learning_metrics.convergence_speed;
                                meta_gradients.meta_lr_gradients[[layer, i, j]] +=
                                    meta_lr_grad * 0.001;

                                if j < meta_context.len() {
                                    meta_gradients.meta_weight_gradients[[layer, i, j]] +=
                                        advantage * meta_context[j] * 0.001;
                                }
                            }
                        }

                        meta_gradients.policy_bias_gradients[[layer, i]] += advantage * 0.01;
                        meta_gradients.meta_bias_gradients[[layer, i]] +=
                            advantage * trajectory.learning_metrics.generalization_score * 0.001;
                    }
                }
            }
        }

        // Average over the batch.
        if !meta_batch.is_empty() {
            let batch_size = meta_batch.len() as f64;
            meta_gradients.policy_gradients /= batch_size;
            meta_gradients.meta_weight_gradients /= batch_size;
            meta_gradients.meta_lr_gradients /= batch_size;
            meta_gradients.policy_bias_gradients /= batch_size;
            meta_gradients.meta_bias_gradients /= batch_size;
        }

        meta_gradients
    }

    fn update_meta_stats(
        &mut self,
        meta_gradients: &MetaGradients,
        problem_class: &str,
        performance: f64,
    ) {
        let grad_norm = meta_gradients
            .policy_gradients
            .iter()
            .map(|&g| g * g)
            .sum::<f64>()
            .sqrt();
        self.meta_stats.meta_gradient_norms.push_back(grad_norm);
        if self.meta_stats.meta_gradient_norms.len() > 1000 {
            self.meta_stats.meta_gradient_norms.pop_front();
        }

        // Exponentially smoothed per-class performance.
        let current_perf = self
            .meta_stats
            .problem_class_performance
            .entry(problem_class.to_string())
            .or_insert(0.0);
        *current_perf = 0.9 * *current_perf + 0.1 * performance;

        self.meta_stats.curriculum_progress = self.curriculum_controller.difficulty_level;

        // Adaptation efficiency tracks how stable recent gradient norms are.
        let grad_stability = if self.meta_stats.meta_gradient_norms.len() > 10 {
            let recent_grads: Vec<f64> = self
                .meta_stats
                .meta_gradient_norms
                .iter()
                .rev()
                .take(10)
                .cloned()
                .collect();
            let mean = recent_grads.iter().sum::<f64>() / recent_grads.len() as f64;
            let variance = recent_grads
                .iter()
                .map(|&x| (x - mean).powi(2))
                .sum::<f64>()
                / recent_grads.len() as f64;
            1.0 / (1.0 + variance)
        } else {
            1.0
        };

        self.meta_stats.adaptation_efficiency =
            0.95 * self.meta_stats.adaptation_efficiency + 0.05 * grad_stability;
    }

    /// Classify the problem by probing local curvature and objective scale
    /// with finite differences along the first few coordinates.
    fn classify_problem<F>(&self, objective: &F, params: &ArrayView1<f64>) -> String
    where
        F: Fn(&ArrayView1<f64>) -> f64,
    {
        let base_value = objective(params);

        let eps = 1e-6;
        let mut curvature_sum = 0.0;

        for i in 0..params.len().min(3) {
            let mut params_plus = params.to_owned();
            let mut params_minus = params.to_owned();
            params_plus[i] += eps;
            params_minus[i] -= eps;

            let f_plus = objective(&params_plus.view());
            let f_minus = objective(&params_minus.view());
            let curvature = (f_plus + f_minus - 2.0 * base_value) / (eps * eps);
            curvature_sum += curvature;
        }

        let avg_curvature = curvature_sum / params.len().min(3) as f64;

        if avg_curvature > 1.0 {
            "convex".to_string()
        } else if avg_curvature < -1.0 {
            "concave".to_string()
        } else if base_value.abs() < 1.0 {
            "low_scale".to_string()
        } else if base_value.abs() > 100.0 {
            "high_scale".to_string()
        } else {
            "general".to_string()
        }
    }
}

impl RLOptimizer for AdvancedAdvancedPolicyGradientOptimizer {
    fn config(&self) -> &RLOptimizationConfig {
        &self.config
    }

    fn select_action(&mut self, state: &OptimizationState) -> OptimizationAction {
        // The problem class is unknown at this level; fall back to "general".
        let problem_class = "general";
        let (state_features, meta_context) = self.extract_meta_state_features(state, problem_class);
        let (policy_output, meta_output) =
            self.meta_policy
                .meta_forward(&state_features.view(), problem_class, &meta_context);
        self.decode_meta_action(&policy_output.view(), &meta_output.view())
    }

    fn update(&mut self, _experience: &Experience) -> Result<(), OptimizeError> {
        // Meta-updates are performed from whole trajectories in `train`.
        Ok(())
    }

    fn run_episode<F>(
        &mut self,
        objective: &F,
        initial_params: &ArrayView1<f64>,
    ) -> OptimizeResult<OptimizeResults<f64>>
    where
        F: Fn(&ArrayView1<f64>) -> f64,
    {
        let problem_class = self.classify_problem(objective, initial_params);
        self.problem_class_history.push_back(problem_class.clone());
        if self.problem_class_history.len() > 100 {
            self.problem_class_history.pop_front();
        }

        let initial_meta_context = Array1::from(vec![
            self.meta_stats.curriculum_progress,
            self.meta_stats.adaptation_efficiency,
            self.curriculum_controller.difficulty_level,
        ]);

        let mut current_params = initial_params.to_owned();
        let mut current_state = utils::create_state(current_params.clone(), objective, 0, None);
        let mut experiences = Vec::new();
        let mut momentum = Array1::zeros(initial_params.len());

        let start_objective = current_state.objective_value;
        let mut max_improvement = 0.0;
        let mut exploration_steps = 0;

        for step in 0..self.config.max_steps_per_episode {
            let action = self.select_action(&current_state);

            let new_params =
                utils::apply_action(&current_state, &action, &self.best_params, &mut momentum);
            let new_state =
                utils::create_state(new_params, objective, step + 1, Some(&current_state));

            let base_reward =
                self.reward_function
                    .compute_reward(&current_state, &action, &new_state);
            let exploration_bonus =
                if matches!(action, OptimizationAction::RandomPerturbation { .. }) {
                    exploration_steps += 1;
                    0.01
                } else {
                    0.0
                };
            let reward = base_reward + exploration_bonus;

            let improvement = current_state.objective_value - new_state.objective_value;
            if improvement > max_improvement {
                max_improvement = improvement;
            }

            let experience = Experience {
                state: current_state.clone(),
                action: action.clone(),
                reward,
                next_state: new_state.clone(),
                done: utils::should_terminate(&new_state, self.config.max_steps_per_episode),
            };
            experiences.push(experience);

            if new_state.objective_value < self.best_objective {
                self.best_objective = new_state.objective_value;
                self.best_params = new_state.parameters.clone();
            }

            current_state = new_state;
            current_params = current_state.parameters.clone();

            if utils::should_terminate(&current_state, self.config.max_steps_per_episode)
                || matches!(action, OptimizationAction::Terminate)
            {
                break;
            }
        }

        // Summarize how well this episode learned.
        let final_objective = current_state.objective_value;
        let total_improvement = start_objective - final_objective;
        let learning_metrics = LearningMetrics {
            improvement_rate: total_improvement / (current_state.step as f64 + 1.0),
            convergence_speed: if total_improvement > 0.0 {
                max_improvement / total_improvement
            } else {
                0.0
            },
            exploration_efficiency: (exploration_steps as f64) / (current_state.step as f64 + 1.0),
            generalization_score: if total_improvement > 0.0 {
                (total_improvement / start_objective.abs()).min(1.0)
            } else {
                0.0
            },
        };

        let meta_trajectory = MetaTrajectory {
            experiences,
            problem_class: problem_class.clone(),
            initial_meta_context,
            learning_metrics: learning_metrics.clone(),
            adaptation_speed: learning_metrics.improvement_rate.abs(),
        };

        self.meta_experience_buffer.add_trajectory(meta_trajectory);

        let episode_performance = learning_metrics.generalization_score;
        self.curriculum_controller
            .update_progress(episode_performance);

        Ok(OptimizeResults::<f64> {
            x: current_params,
            fun: current_state.objective_value,
            success: current_state.convergence_metrics.relative_objective_change < 1e-6,
            nit: current_state.step,
            nfev: current_state.step,
            njev: 0,
            nhev: 0,
            maxcv: 0,
            status: 0,
            message: format!(
                "Meta-policy gradient episode completed for problem class: {}",
                problem_class
            ),
            jac: None,
            hess: None,
            constr: None,
        })
    }

    fn train<F>(
        &mut self,
        objective: &F,
        initial_params: &ArrayView1<f64>,
    ) -> OptimizeResult<OptimizeResults<f64>>
    where
        F: Fn(&ArrayView1<f64>) -> f64,
    {
        let mut best_result = OptimizeResults::<f64> {
            x: initial_params.to_owned(),
            fun: f64::INFINITY,
            success: false,
            nit: 0,
            nfev: 0,
            njev: 0,
            nhev: 0,
            maxcv: 0,
            status: 0,
            message: "Meta-learning training not completed".to_string(),
            jac: None,
            hess: None,
            constr: None,
        };

        for episode in 0..self.config.num_episodes {
            let result = self.run_episode(objective, initial_params)?;

            if result.fun < best_result.fun {
                best_result = result;
            }

            // Every 5 episodes, perform a meta-update from a sampled batch.
            if (episode + 1) % 5 == 0 && self.meta_experience_buffer.trajectories.len() >= 10 {
                let meta_batch = self.meta_experience_buffer.sample_meta_batch(10);
                let meta_gradients = self.compute_meta_gradients(&meta_batch);

                self.meta_policy.meta_update(
                    &meta_gradients,
                    self.config.learning_rate,
                    self.config.learning_rate * 0.1,
                );

                let avg_performance = meta_batch
                    .iter()
                    .map(|t| t.learning_metrics.generalization_score)
                    .sum::<f64>()
                    / meta_batch.len() as f64;

                if let Some(trajectory) = meta_batch.first() {
                    self.update_meta_stats(
                        &meta_gradients,
                        &trajectory.problem_class,
                        avg_performance,
                    );
                }
            }
        }

        best_result.x = self.best_params.clone();
        best_result.fun = self.best_objective;
        best_result.message = format!(
            "Meta-learning training completed. Curriculum level: {:.3}, Adaptation efficiency: {:.3}",
            self.meta_stats.curriculum_progress,
            self.meta_stats.adaptation_efficiency
        );

        Ok(best_result)
    }

    fn reset(&mut self) {
        self.meta_trajectories.clear();
        self.problem_class_history.clear();
        self.best_objective = f64::INFINITY;
        self.best_params.fill(0.0);
        self.meta_stats.meta_gradient_norms.clear();
        self.meta_stats.problem_class_performance.clear();
        self.curriculum_controller = CurriculumController::new();
        self.meta_experience_buffer = MetaExperienceBuffer::new(500);
    }
}

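/// Convenience entry point: builds an `AdvancedAdvancedPolicyGradientOptimizer`
/// and trains it on the given objective. If no configuration is supplied, it
/// uses 100 episodes, 50 steps per episode, and a 1e-3 learning rate.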
#[allow(dead_code)]
pub fn advanced_advanced_policy_gradient_optimize<F>(
    objective: F,
    initial_params: &ArrayView1<f64>,
    config: Option<RLOptimizationConfig>,
) -> OptimizeResult<OptimizeResults<f64>>
where
    F: Fn(&ArrayView1<f64>) -> f64,
{
    let config = config.unwrap_or_else(|| RLOptimizationConfig {
        num_episodes: 100,
        max_steps_per_episode: 50,
        learning_rate: 0.001,
        ..Default::default()
    });

    let mut optimizer = AdvancedAdvancedPolicyGradientOptimizer::new(
        config,
        initial_params.len() + 5, // state size: parameters plus five extra state features
        6,                        // action size: number of decoded action variants
    );
    optimizer.train(&objective, initial_params)
}

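/// Thin wrapper that forwards to `advanced_advanced_policy_gradient_optimize`.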
#[allow(dead_code)]
pub fn policy_gradient_optimize<F>(
    objective: F,
    initial_params: &ArrayView1<f64>,
    config: Option<RLOptimizationConfig>,
) -> OptimizeResult<OptimizeResults<f64>>
where
    F: Fn(&ArrayView1<f64>) -> f64,
{
    advanced_advanced_policy_gradient_optimize(objective, initial_params, config)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_meta_policy_network_creation() {
        let network = MetaPolicyNetwork::new(4, 2, vec![8, 6]);
        assert_eq!(network.layer_sizes, vec![4, 8, 6, 2]);
    }

    #[test]
    fn test_meta_forward_pass() {
        let mut network = MetaPolicyNetwork::new(3, 2, vec![4]);
        let input = Array1::from(vec![0.5, -0.3, 0.8]);
        let meta_context = Array1::from(vec![0.1, 0.2]);

        let (policy_out, meta_out) = network.meta_forward(&input.view(), "test", &meta_context);

        assert_eq!(policy_out.len(), 2);
        assert_eq!(meta_out.len(), 2);
    }

    #[test]
    fn test_curriculum_controller() {
        let mut controller = CurriculumController::new();
        assert_eq!(controller.difficulty_level, 0.1);

        for _ in 0..25 {
            controller.update_progress(0.9);
        }

        assert!(controller.difficulty_level > 0.1);
    }

    #[test]
    fn test_meta_experience_buffer() {
        let mut buffer = MetaExperienceBuffer::new(10);

        let trajectory = MetaTrajectory {
            experiences: vec![],
            problem_class: "test".to_string(),
            initial_meta_context: Array1::zeros(3),
            learning_metrics: LearningMetrics {
                improvement_rate: 0.1,
                convergence_speed: 0.2,
                exploration_efficiency: 0.3,
                generalization_score: 0.4,
            },
            adaptation_speed: 0.1,
        };

        buffer.add_trajectory(trajectory);
        assert_eq!(buffer.trajectories.len(), 1);

        let batch = buffer.sample_meta_batch(1);
        assert_eq!(batch.len(), 1);
    }

    #[test]
    fn test_advanced_advanced_optimizer_creation() {
        let config = RLOptimizationConfig::default();
        let optimizer = AdvancedAdvancedPolicyGradientOptimizer::new(config, 4, 3);

        assert_eq!(optimizer.meta_policy.layer_sizes[0], 4);
        assert_eq!(optimizer.meta_policy.layer_sizes.last(), Some(&3));
    }

    #[test]
    fn test_problem_classification() {
        let config = RLOptimizationConfig::default();
        let optimizer = AdvancedAdvancedPolicyGradientOptimizer::new(config, 2, 3);

        let quadratic = |x: &ArrayView1<f64>| x[0].powi(2) + x[1].powi(2);
        let params = Array1::from(vec![1.0, 1.0]);

        let class = optimizer.classify_problem(&quadratic, &params.view());
        assert!(!class.is_empty());
    }

    #[test]
    fn test_meta_optimization() {
        let config = RLOptimizationConfig {
            num_episodes: 50,
            max_steps_per_episode: 50,
            learning_rate: 0.05,
            ..Default::default()
        };

        let objective = |x: &ArrayView1<f64>| (x[0] - 1.0).powi(2) + (x[1] + 0.5).powi(2);
        let initial = Array1::from(vec![0.0, 0.0]);

        let result =
            advanced_advanced_policy_gradient_optimize(objective, &initial.view(), Some(config))
                .unwrap();

        assert!(result.nit > 0);
        assert!(result.fun <= objective(&initial.view()) * 1.01);
    }
}

#[allow(dead_code)]
pub fn placeholder() {}