use super::{
    utils, Experience, ImprovementReward, OptimizationAction, OptimizationState,
    RLOptimizationConfig, RLOptimizer, RewardFunction,
};
use crate::error::{OptimizeError, OptimizeResult};
use crate::result::OptimizeResults;
use ndarray::{Array1, Array2, Array3, ArrayView1};
use rand::{rng, Rng};
use std::collections::{HashMap, VecDeque};

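/// Policy network with an additional meta-learning pathway.
///
/// Per-layer policy and meta weights are stored in padded `Array3` tensors
/// (layer x output x input), alongside per-unit adaptive learning rates,
/// accumulators for meta-gradients and second-order information, a curriculum
/// difficulty level, and learned per-problem-class input embeddings.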
#[derive(Debug, Clone)]
pub struct MetaPolicyNetwork {
    pub policy_weights: Array3<f64>,
    pub meta_weights: Array3<f64>,
    pub policy_bias: Array2<f64>,
    pub meta_bias: Array2<f64>,
    pub layer_sizes: Vec<usize>,
    pub adaptive_learning_rates: Array2<f64>,
    pub meta_gradient_accumulator: Array3<f64>,
    pub second_order_info: Array3<f64>,
    pub curriculum_difficulty: f64,
    pub problem_embeddings: HashMap<String, Array1<f64>>,
}

impl MetaPolicyNetwork {
    pub fn new(input_size: usize, output_size: usize, hidden_sizes: Vec<usize>) -> Self {
        let mut layer_sizes = vec![input_size];
        layer_sizes.extend(hidden_sizes);
        layer_sizes.push(output_size);

        let num_layers = layer_sizes.len() - 1;
        let max_layer_size = *layer_sizes.iter().max().unwrap();

        let mut policy_weights = Array3::zeros((num_layers, max_layer_size, max_layer_size));
        let mut meta_weights = Array3::zeros((num_layers, max_layer_size, max_layer_size));

        // Xavier-style initialization; meta weights start an order of magnitude smaller.
        let mut rng = rng();
        for layer in 0..num_layers {
            let fan_in = layer_sizes[layer];
            let fan_out = layer_sizes[layer + 1];
            let xavier_std = (2.0 / (fan_in + fan_out) as f64).sqrt();

            for i in 0..fan_out {
                for j in 0..fan_in {
                    policy_weights[[layer, i, j]] =
                        rng.random_range(-0.5..0.5) * 2.0 * xavier_std;
                    meta_weights[[layer, i, j]] =
                        rng.random_range(-0.5..0.5) * 2.0 * xavier_std * 0.1;
                }
            }
        }

        Self {
            policy_weights,
            meta_weights,
            policy_bias: Array2::zeros((num_layers, max_layer_size)),
            meta_bias: Array2::zeros((num_layers, max_layer_size)),
            layer_sizes,
            adaptive_learning_rates: Array2::from_elem((num_layers, max_layer_size), 0.01),
            meta_gradient_accumulator: Array3::zeros((num_layers, max_layer_size, max_layer_size)),
            second_order_info: Array3::zeros((num_layers, max_layer_size, max_layer_size)),
            curriculum_difficulty: 0.1,
            problem_embeddings: HashMap::new(),
        }
    }

    /// Run both the policy head and the meta head on `state_features`,
    /// after blending in the problem-class embedding.
    pub fn meta_forward(
        &mut self,
        state_features: &ArrayView1<f64>,
        problem_class: &str,
        meta_context: &Array1<f64>,
    ) -> (Array1<f64>, Array1<f64>) {
        let problem_embedding =
            self.get_or_create_problem_embedding(problem_class, state_features.len());

        // Blend the problem embedding into the raw state features.
        let mut augmented_input = state_features.to_owned();
        for (i, &emb) in problem_embedding.iter().enumerate() {
            if i < augmented_input.len() {
                augmented_input[i] += emb * 0.1;
            }
        }

        let policy_output = self.forward_policy(&augmented_input.view());
        let meta_output = self.forward_meta(&augmented_input.view(), meta_context);

        (policy_output, meta_output)
    }

    fn forward_policy(&self, input: &ArrayView1<f64>) -> Array1<f64> {
        let mut current_input = input.to_owned();

        for layer in 0..(self.layer_sizes.len() - 1) {
            let layer_input_size = self.layer_sizes[layer];
            let layer_output_size = self.layer_sizes[layer + 1];

            let mut layer_output = Array1::<f64>::zeros(layer_output_size);

            for i in 0..layer_output_size {
                for j in 0..layer_input_size.min(current_input.len()) {
                    layer_output[i] += self.policy_weights[[layer, i, j]] * current_input[j];
                }
                layer_output[i] += self.policy_bias[[layer, i]];

                // ELU activation.
                layer_output[i] = if layer_output[i] > 0.0 {
                    layer_output[i]
                } else {
                    layer_output[i].exp() - 1.0
                };
            }

            current_input = layer_output;
        }

        current_input
    }

    fn forward_meta(&self, input: &ArrayView1<f64>, meta_context: &Array1<f64>) -> Array1<f64> {
        // Mix the meta context into the input before the meta layers.
        let mut meta_input = input.to_owned();
        for (i, &ctx) in meta_context.iter().enumerate() {
            if i < meta_input.len() {
                meta_input[i] += ctx * 0.05;
            }
        }

        let mut current_input = meta_input;

        for layer in 0..(self.layer_sizes.len() - 1) {
            let layer_input_size = self.layer_sizes[layer];
            let layer_output_size = self.layer_sizes[layer + 1];

            let mut layer_output = Array1::<f64>::zeros(layer_output_size);

            for i in 0..layer_output_size {
                for j in 0..layer_input_size.min(current_input.len()) {
                    layer_output[i] += self.meta_weights[[layer, i, j]] * current_input[j];
                }
                layer_output[i] += self.meta_bias[[layer, i]];

                // Sigmoid activation keeps meta outputs in (0, 1).
                layer_output[i] = 1.0 / (1.0 + (-layer_output[i]).exp());
            }

            current_input = layer_output;
        }

        current_input
    }

    fn get_or_create_problem_embedding(
        &mut self,
        problem_class: &str,
        input_size: usize,
    ) -> Array1<f64> {
        if let Some(embedding) = self.problem_embeddings.get(problem_class) {
            embedding.clone()
        } else {
            let embedding =
                Array1::from_shape_fn(input_size, |_| rng().random_range(-0.05..0.05));
            self.problem_embeddings
                .insert(problem_class.to_string(), embedding.clone());
            embedding
        }
    }

    /// Apply meta-gradients: adapt per-unit learning rates, then update policy
    /// and meta parameters with their respective learning rates.
    pub fn meta_update(
        &mut self,
        meta_gradients: &MetaGradients,
        base_learning_rate: f64,
        meta_learning_rate: f64,
    ) {
        for layer in 0..(self.layer_sizes.len() - 1) {
            for i in 0..self.layer_sizes[layer + 1] {
                for j in 0..self.layer_sizes[layer] {
                    let meta_grad = meta_gradients.meta_lr_gradients[[layer, i, j]];
                    self.adaptive_learning_rates[[layer, i]] *=
                        (1.0 + meta_learning_rate * meta_grad).clamp(0.1, 10.0);

                    let adaptive_lr = self.adaptive_learning_rates[[layer, i]] * base_learning_rate;
                    self.policy_weights[[layer, i, j]] +=
                        adaptive_lr * meta_gradients.policy_gradients[[layer, i, j]];

                    self.meta_weights[[layer, i, j]] +=
                        meta_learning_rate * meta_gradients.meta_weight_gradients[[layer, i, j]];
                }

                let adaptive_lr = self.adaptive_learning_rates[[layer, i]] * base_learning_rate;
                self.policy_bias[[layer, i]] +=
                    adaptive_lr * meta_gradients.policy_bias_gradients[[layer, i]];
                self.meta_bias[[layer, i]] +=
                    meta_learning_rate * meta_gradients.meta_bias_gradients[[layer, i]];
            }
        }

        self.update_curriculum_difficulty(meta_gradients);
    }

    fn update_curriculum_difficulty(&mut self, meta_gradients: &MetaGradients) {
        let gradient_norm = meta_gradients
            .policy_gradients
            .iter()
            .map(|&g| g * g)
            .sum::<f64>()
            .sqrt();

        if gradient_norm < 0.1 {
            self.curriculum_difficulty = (self.curriculum_difficulty * 1.05).min(1.0);
        } else if gradient_norm > 1.0 {
            self.curriculum_difficulty = (self.curriculum_difficulty * 0.95).max(0.01);
        }
    }
}

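/// Gradient estimates for one meta-update: policy weight/bias gradients, meta
/// weight/bias gradients, per-weight learning-rate gradients, and a slot for
/// second-order terms.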
#[derive(Debug, Clone)]
pub struct MetaGradients {
    pub policy_gradients: Array3<f64>,
    pub meta_weight_gradients: Array3<f64>,
    pub meta_lr_gradients: Array3<f64>,
    pub policy_bias_gradients: Array2<f64>,
    pub meta_bias_gradients: Array2<f64>,
    pub second_order_terms: Array3<f64>,
}

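/// Policy-gradient optimizer with meta-learning on top: it classifies problems,
/// tracks per-class performance, adjusts a training curriculum, and periodically
/// updates its meta-policy network from a buffer of past trajectories.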
#[derive(Debug, Clone)]
pub struct AdvancedAdvancedPolicyGradientOptimizer {
    config: RLOptimizationConfig,
    meta_policy: MetaPolicyNetwork,
    reward_function: ImprovementReward,
    meta_trajectories: VecDeque<MetaTrajectory>,
    problem_class_history: VecDeque<String>,
    best_params: Array1<f64>,
    best_objective: f64,
    meta_stats: MetaLearningStats,
    curriculum_controller: CurriculumController,
    meta_experience_buffer: MetaExperienceBuffer,
}

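/// One optimization episode recorded for meta-learning: the raw experiences,
/// the problem class, the meta context at the start, and summary metrics.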
#[derive(Debug, Clone)]
pub struct MetaTrajectory {
    pub experiences: Vec<Experience>,
    pub problem_class: String,
    pub initial_meta_context: Array1<f64>,
    pub learning_metrics: LearningMetrics,
    pub adaptation_speed: f64,
}

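/// Summary diagnostics of a single episode used when computing meta-gradients.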
#[derive(Debug, Clone)]
pub struct LearningMetrics {
    pub improvement_rate: f64,
    pub convergence_speed: f64,
    pub exploration_efficiency: f64,
    pub generalization_score: f64,
}

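/// Running statistics of the meta-learning process: recent meta-gradient norms,
/// per-problem-class performance, curriculum progress, and adaptation efficiency.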
#[derive(Debug, Clone)]
pub struct MetaLearningStats {
    pub avg_learning_rates: Array1<f64>,
    pub meta_gradient_norms: VecDeque<f64>,
    pub problem_class_performance: HashMap<String, f64>,
    pub curriculum_progress: f64,
    pub adaptation_efficiency: f64,
}

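/// Tracks recent episode performance and raises the difficulty level once the
/// rolling average clears the threshold for the current difficulty band.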
#[derive(Debug, Clone)]
pub struct CurriculumController {
    pub difficulty_level: f64,
    pub advancement_thresholds: Vec<f64>,
    pub difficulty_generators: HashMap<String, f64>,
    pub progress_tracker: VecDeque<f64>,
}

impl CurriculumController {
    pub fn new() -> Self {
        Self {
            difficulty_level: 0.1,
            advancement_thresholds: vec![0.8, 0.85, 0.9, 0.95],
            difficulty_generators: HashMap::new(),
            progress_tracker: VecDeque::with_capacity(100),
        }
    }

    pub fn should_advance(&self) -> bool {
        if self.progress_tracker.len() < 20 {
            return false;
        }

        let recent_performance: f64 =
            self.progress_tracker.iter().rev().take(20).sum::<f64>() / 20.0;

        let threshold_idx = ((self.difficulty_level * 4.0) as usize).min(3);
        recent_performance > self.advancement_thresholds[threshold_idx]
    }

    pub fn advance_difficulty(&mut self) {
        self.difficulty_level = (self.difficulty_level * 1.2).min(1.0);
    }

    pub fn update_progress(&mut self, performance: f64) {
        self.progress_tracker.push_back(performance);
        if self.progress_tracker.len() > 100 {
            self.progress_tracker.pop_front();
        }

        if self.should_advance() {
            self.advance_difficulty();
        }
    }
}

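/// Bounded FIFO buffer of meta-trajectories, with per-class weights nudged up or
/// down depending on whether a class's trajectories are rewarding on average.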
#[derive(Debug, Clone)]
pub struct MetaExperienceBuffer {
    pub trajectories: VecDeque<MetaTrajectory>,
    pub max_size: usize,
    pub class_weights: HashMap<String, f64>,
}

impl MetaExperienceBuffer {
    pub fn new(max_size: usize) -> Self {
        Self {
            trajectories: VecDeque::with_capacity(max_size),
            max_size,
            class_weights: HashMap::new(),
        }
    }

    pub fn add_trajectory(&mut self, trajectory: MetaTrajectory) {
        // Reweight the problem class by whether this trajectory was rewarding on average.
        let avg_reward = trajectory.experiences.iter().map(|e| e.reward).sum::<f64>()
            / trajectory.experiences.len().max(1) as f64;

        *self
            .class_weights
            .entry(trajectory.problem_class.clone())
            .or_insert(1.0) *= if avg_reward > 0.0 { 1.05 } else { 0.95 };

        self.trajectories.push_back(trajectory);
        if self.trajectories.len() > self.max_size {
            self.trajectories.pop_front();
        }
    }

    /// Sample (with replacement) up to `batch_size` trajectories uniformly at random.
    pub fn sample_meta_batch(&self, batch_size: usize) -> Vec<MetaTrajectory> {
        let mut batch = Vec::new();

        for _ in 0..batch_size.min(self.trajectories.len()) {
            let idx = rng().random_range(0..self.trajectories.len());
            if let Some(trajectory) = self.trajectories.get(idx) {
                batch.push(trajectory.clone());
            }
        }

        batch
    }
}

impl AdvancedAdvancedPolicyGradientOptimizer {
    pub fn new(config: RLOptimizationConfig, state_size: usize, action_size: usize) -> Self {
        let hidden_sizes = vec![state_size * 2, state_size * 3, state_size * 2];
        let meta_policy = MetaPolicyNetwork::new(state_size, action_size, hidden_sizes);

        Self {
            config,
            meta_policy,
            reward_function: ImprovementReward::default(),
            meta_trajectories: VecDeque::with_capacity(1000),
            problem_class_history: VecDeque::with_capacity(100),
            best_params: Array1::zeros(state_size),
            best_objective: f64::INFINITY,
            meta_stats: MetaLearningStats {
                avg_learning_rates: Array1::zeros(state_size),
                meta_gradient_norms: VecDeque::with_capacity(1000),
                problem_class_performance: HashMap::new(),
                curriculum_progress: 0.0,
                adaptation_efficiency: 1.0,
            },
            curriculum_controller: CurriculumController::new(),
            meta_experience_buffer: MetaExperienceBuffer::new(500),
        }
    }

    /// Build the base state feature vector and the meta-context vector for a state.
    fn extract_meta_state_features(
        &self,
        state: &OptimizationState,
        problem_class: &str,
    ) -> (Array1<f64>, Array1<f64>) {
        let mut base_features = Vec::new();

        // Squash raw parameters into [-1, 1].
        for &param in state.parameters.iter() {
            base_features.push(param.tanh());
        }

        base_features.push((state.objective_value / (state.objective_value.abs() + 1.0)).tanh());
        base_features.push(
            state
                .convergence_metrics
                .relative_objective_change
                .ln()
                .max(-10.0)
                .tanh(),
        );
        base_features.push(state.convergence_metrics.parameter_change_norm.tanh());

        base_features.push((state.step as f64 / 100.0).tanh());

        let problem_difficulty = self.meta_policy.curriculum_difficulty;
        base_features.push(problem_difficulty);

        let mut meta_context = Vec::new();

        let class_performance = self
            .meta_stats
            .problem_class_performance
            .get(problem_class)
            .copied()
            .unwrap_or(0.0);
        meta_context.push(class_performance);

        let recent_meta_grad_norm = self
            .meta_stats
            .meta_gradient_norms
            .iter()
            .rev()
            .take(10)
            .sum::<f64>()
            / 10.0;
        meta_context.push(recent_meta_grad_norm.tanh());

        meta_context.push(self.meta_stats.curriculum_progress);

        meta_context.push(self.meta_stats.adaptation_efficiency);

        // Diversity of recently seen problem classes.
        let recent_classes: std::collections::HashSet<String> = self
            .problem_class_history
            .iter()
            .rev()
            .take(10)
            .cloned()
            .collect();
        meta_context.push((recent_classes.len() as f64 / 10.0).min(1.0));

        (Array1::from(base_features), Array1::from(meta_context))
    }

    /// Map policy logits (modulated by the meta head) to a concrete optimization action.
    fn decode_meta_action(
        &self,
        policy_output: &ArrayView1<f64>,
        meta_output: &ArrayView1<f64>,
    ) -> OptimizationAction {
        if policy_output.is_empty() {
            return OptimizationAction::GradientStep {
                learning_rate: 0.01,
            };
        }

        let meta_modulation = meta_output.get(0).copied().unwrap_or(1.0);
        let action_strength = meta_output.get(1).copied().unwrap_or(1.0);

        let action_logits = policy_output.mapv(|x| x * meta_modulation);
        let action_type = action_logits
            .iter()
            .enumerate()
            .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
            .map(|(idx, _)| idx)
            .unwrap_or(0);

        match action_type {
            0 => OptimizationAction::GradientStep {
                learning_rate: 0.01 * action_strength * (1.0 + policy_output[0] * 0.5),
            },
            1 => OptimizationAction::RandomPerturbation {
                magnitude: 0.1 * action_strength * (1.0 + policy_output[1] * 0.5),
            },
            2 => OptimizationAction::MomentumUpdate {
                momentum: (0.9 * action_strength * (1.0 + policy_output[2] * 0.1)).min(0.99),
            },
            3 => OptimizationAction::AdaptiveLearningRate {
                factor: (0.5 + 0.5 * policy_output[3] * action_strength).clamp(0.1, 2.0),
            },
            4 => OptimizationAction::ResetToBest,
            _ => OptimizationAction::Terminate,
        }
    }

    /// Estimate meta-gradients from a batch of trajectories using a simple
    /// REINFORCE-style advantage, then average over the batch.
    fn compute_meta_gradients(&self, meta_batch: &[MetaTrajectory]) -> MetaGradients {
        let num_layers = self.meta_policy.layer_sizes.len() - 1;
        let max_size = *self.meta_policy.layer_sizes.iter().max().unwrap();

        let mut meta_gradients = MetaGradients {
            policy_gradients: Array3::zeros((num_layers, max_size, max_size)),
            meta_weight_gradients: Array3::zeros((num_layers, max_size, max_size)),
            meta_lr_gradients: Array3::zeros((num_layers, max_size, max_size)),
            policy_bias_gradients: Array2::zeros((num_layers, max_size)),
            meta_bias_gradients: Array2::zeros((num_layers, max_size)),
            second_order_terms: Array3::zeros((num_layers, max_size, max_size)),
        };

        for trajectory in meta_batch {
            let trajectory_return: f64 = trajectory.experiences.iter().map(|e| e.reward).sum();

            // Bonus terms reward fast, exploratory learning.
            let learning_speed_bonus = trajectory.learning_metrics.convergence_speed * 0.1;
            let exploration_bonus = trajectory.learning_metrics.exploration_efficiency * 0.05;
            let adjusted_return = trajectory_return + learning_speed_bonus + exploration_bonus;

            for (step, experience) in trajectory.experiences.iter().enumerate() {
                let (state_features, meta_context) =
                    self.extract_meta_state_features(&experience.state, &trajectory.problem_class);

                // Discounted return from this step onward.
                let gamma = self.config.discount_factor;
                let step_return: f64 = trajectory.experiences[step..]
                    .iter()
                    .enumerate()
                    .map(|(i, e)| gamma.powi(i as i32) * e.reward)
                    .sum();

                let advantage = step_return - adjusted_return / trajectory.experiences.len() as f64;

                for layer in 0..num_layers {
                    for i in 0..self.meta_policy.layer_sizes[layer + 1] {
                        for j in 0..self.meta_policy.layer_sizes[layer] {
                            if j < state_features.len() {
                                meta_gradients.policy_gradients[[layer, i, j]] +=
                                    advantage * state_features[j] * 0.01;

                                let meta_lr_grad = advantage
                                    * state_features[j]
                                    * trajectory.learning_metrics.convergence_speed;
                                meta_gradients.meta_lr_gradients[[layer, i, j]] +=
                                    meta_lr_grad * 0.001;

                                if j < meta_context.len() {
                                    meta_gradients.meta_weight_gradients[[layer, i, j]] +=
                                        advantage * meta_context[j] * 0.001;
                                }
                            }
                        }

                        meta_gradients.policy_bias_gradients[[layer, i]] += advantage * 0.01;
                        meta_gradients.meta_bias_gradients[[layer, i]] +=
                            advantage * trajectory.learning_metrics.generalization_score * 0.001;
                    }
                }
            }
        }

        if !meta_batch.is_empty() {
            let batch_size = meta_batch.len() as f64;
            meta_gradients.policy_gradients /= batch_size;
            meta_gradients.meta_weight_gradients /= batch_size;
            meta_gradients.meta_lr_gradients /= batch_size;
            meta_gradients.policy_bias_gradients /= batch_size;
            meta_gradients.meta_bias_gradients /= batch_size;
        }

        meta_gradients
    }

    fn update_meta_stats(
        &mut self,
        meta_gradients: &MetaGradients,
        problem_class: &str,
        performance: f64,
    ) {
        let grad_norm = meta_gradients
            .policy_gradients
            .iter()
            .map(|&g| g * g)
            .sum::<f64>()
            .sqrt();
        self.meta_stats.meta_gradient_norms.push_back(grad_norm);
        if self.meta_stats.meta_gradient_norms.len() > 1000 {
            self.meta_stats.meta_gradient_norms.pop_front();
        }

        let current_perf = self
            .meta_stats
            .problem_class_performance
            .entry(problem_class.to_string())
            .or_insert(0.0);
        *current_perf = 0.9 * *current_perf + 0.1 * performance;

        self.meta_stats.curriculum_progress = self.curriculum_controller.difficulty_level;

        let grad_stability = if self.meta_stats.meta_gradient_norms.len() > 10 {
            let recent_grads: Vec<f64> = self
                .meta_stats
                .meta_gradient_norms
                .iter()
                .rev()
                .take(10)
                .cloned()
                .collect();
            let mean = recent_grads.iter().sum::<f64>() / recent_grads.len() as f64;
            let variance = recent_grads
                .iter()
                .map(|&x| (x - mean).powi(2))
                .sum::<f64>()
                / recent_grads.len() as f64;
            1.0 / (1.0 + variance)
        } else {
            1.0
        };

        self.meta_stats.adaptation_efficiency =
            0.95 * self.meta_stats.adaptation_efficiency + 0.05 * grad_stability;
    }

    /// Classify the problem by probing curvature along the first few coordinates
    /// with central finite differences, falling back to a scale-based label.
    fn classify_problem<F>(&self, objective: &F, params: &ArrayView1<f64>) -> String
    where
        F: Fn(&ArrayView1<f64>) -> f64,
    {
        let base_value = objective(params);

        let eps = 1e-6;
        let mut curvature_sum = 0.0;

        for i in 0..params.len().min(3) {
            let mut params_plus = params.to_owned();
            let mut params_minus = params.to_owned();
            params_plus[i] += eps;
            params_minus[i] -= eps;

            let f_plus = objective(&params_plus.view());
            let f_minus = objective(&params_minus.view());
            let curvature = (f_plus + f_minus - 2.0 * base_value) / (eps * eps);
            curvature_sum += curvature;
        }

        let avg_curvature = curvature_sum / params.len().min(3) as f64;

        if avg_curvature > 1.0 {
            "convex".to_string()
        } else if avg_curvature < -1.0 {
            "concave".to_string()
        } else if base_value.abs() < 1.0 {
            "low_scale".to_string()
        } else if base_value.abs() > 100.0 {
            "high_scale".to_string()
        } else {
            "general".to_string()
        }
    }
}

impl RLOptimizer for AdvancedAdvancedPolicyGradientOptimizer {
    fn config(&self) -> &RLOptimizationConfig {
        &self.config
    }

    fn select_action(&mut self, state: &OptimizationState) -> OptimizationAction {
        // Without access to the objective here, fall back to the generic problem class.
        let problem_class = "general";
        let (state_features, meta_context) =
            self.extract_meta_state_features(state, problem_class);
        let (policy_output, meta_output) =
            self.meta_policy
                .meta_forward(&state_features.view(), problem_class, &meta_context);
        self.decode_meta_action(&policy_output.view(), &meta_output.view())
    }

    fn update(&mut self, _experience: &Experience) -> Result<(), OptimizeError> {
        // Per-step updates are a no-op; meta-gradient updates are batched in `train`.
        Ok(())
    }

    fn run_episode<F>(
        &mut self,
        objective: &F,
        initial_params: &ArrayView1<f64>,
    ) -> OptimizeResult<OptimizeResults<f64>>
    where
        F: Fn(&ArrayView1<f64>) -> f64,
    {
        let problem_class = self.classify_problem(objective, initial_params);
        self.problem_class_history.push_back(problem_class.clone());
        if self.problem_class_history.len() > 100 {
            self.problem_class_history.pop_front();
        }

        let initial_meta_context = Array1::from(vec![
            self.meta_stats.curriculum_progress,
            self.meta_stats.adaptation_efficiency,
            self.curriculum_controller.difficulty_level,
        ]);

        let mut current_params = initial_params.to_owned();
        let mut current_state = utils::create_state(current_params.clone(), objective, 0, None);
        let mut experiences = Vec::new();
        let mut momentum = Array1::zeros(initial_params.len());

        let start_objective = current_state.objective_value;
        let mut max_improvement = 0.0;
        let mut exploration_steps = 0;

        for step in 0..self.config.max_steps_per_episode {
            let action = self.select_action(&current_state);

            let new_params =
                utils::apply_action(&current_state, &action, &self.best_params, &mut momentum);
            let new_state =
                utils::create_state(new_params, objective, step + 1, Some(&current_state));

            // Base reward plus a small bonus for exploratory actions.
            let base_reward =
                self.reward_function
                    .compute_reward(&current_state, &action, &new_state);
            let exploration_bonus =
                if matches!(action, OptimizationAction::RandomPerturbation { .. }) {
                    exploration_steps += 1;
                    0.01
                } else {
                    0.0
                };
            let reward = base_reward + exploration_bonus;

            let improvement = current_state.objective_value - new_state.objective_value;
            if improvement > max_improvement {
                max_improvement = improvement;
            }

            let experience = Experience {
                state: current_state.clone(),
                action: action.clone(),
                reward,
                next_state: new_state.clone(),
                done: utils::should_terminate(&new_state, self.config.max_steps_per_episode),
            };
            experiences.push(experience);

            if new_state.objective_value < self.best_objective {
                self.best_objective = new_state.objective_value;
                self.best_params = new_state.parameters.clone();
            }

            current_state = new_state;
            current_params = current_state.parameters.clone();

            if utils::should_terminate(&current_state, self.config.max_steps_per_episode)
                || matches!(action, OptimizationAction::Terminate)
            {
                break;
            }
        }

        // Summarize how well this episode learned.
        let final_objective = current_state.objective_value;
        let total_improvement = start_objective - final_objective;
        let learning_metrics = LearningMetrics {
            improvement_rate: total_improvement / (current_state.step as f64 + 1.0),
            convergence_speed: if total_improvement > 0.0 {
                max_improvement / total_improvement
            } else {
                0.0
            },
            exploration_efficiency: (exploration_steps as f64) / (current_state.step as f64 + 1.0),
            generalization_score: if total_improvement > 0.0 {
                (total_improvement / start_objective.abs()).min(1.0)
            } else {
                0.0
            },
        };

        let meta_trajectory = MetaTrajectory {
            experiences,
            problem_class: problem_class.clone(),
            initial_meta_context,
            learning_metrics: learning_metrics.clone(),
            adaptation_speed: learning_metrics.improvement_rate.abs(),
        };

        self.meta_experience_buffer.add_trajectory(meta_trajectory);

        let episode_performance = learning_metrics.generalization_score;
        self.curriculum_controller
            .update_progress(episode_performance);

        Ok(OptimizeResults::<f64> {
            x: current_params,
            fun: current_state.objective_value,
            success: current_state.convergence_metrics.relative_objective_change < 1e-6,
            nit: current_state.step,
            nfev: current_state.step,
            njev: 0,
            nhev: 0,
            maxcv: 0,
            status: 0,
            message: format!(
                "Meta-policy gradient episode completed for problem class: {}",
                problem_class
            ),
            jac: None,
            hess: None,
            constr: None,
        })
    }

    fn train<F>(
        &mut self,
        objective: &F,
        initial_params: &ArrayView1<f64>,
    ) -> OptimizeResult<OptimizeResults<f64>>
    where
        F: Fn(&ArrayView1<f64>) -> f64,
    {
        let mut best_result = OptimizeResults::<f64> {
            x: initial_params.to_owned(),
            fun: f64::INFINITY,
            success: false,
            nit: 0,
            nfev: 0,
            njev: 0,
            nhev: 0,
            maxcv: 0,
            status: 0,
            message: "Meta-learning training not completed".to_string(),
            jac: None,
            hess: None,
            constr: None,
        };

        for episode in 0..self.config.num_episodes {
            let result = self.run_episode(objective, initial_params)?;

            if result.fun < best_result.fun {
                best_result = result;
            }

            // Every 5 episodes, update the meta-policy from a sampled trajectory batch.
            if (episode + 1) % 5 == 0 && self.meta_experience_buffer.trajectories.len() >= 10 {
                let meta_batch = self.meta_experience_buffer.sample_meta_batch(10);
                let meta_gradients = self.compute_meta_gradients(&meta_batch);

                self.meta_policy.meta_update(
                    &meta_gradients,
                    self.config.learning_rate,
                    self.config.learning_rate * 0.1,
                );

                let avg_performance = meta_batch
                    .iter()
                    .map(|t| t.learning_metrics.generalization_score)
                    .sum::<f64>()
                    / meta_batch.len() as f64;

                if let Some(trajectory) = meta_batch.first() {
                    self.update_meta_stats(
                        &meta_gradients,
                        &trajectory.problem_class,
                        avg_performance,
                    );
                }
            }
        }

        best_result.x = self.best_params.clone();
        best_result.fun = self.best_objective;
        best_result.message = format!(
            "Meta-learning training completed. Curriculum level: {:.3}, Adaptation efficiency: {:.3}",
            self.meta_stats.curriculum_progress,
            self.meta_stats.adaptation_efficiency
        );

        Ok(best_result)
    }

    fn reset(&mut self) {
        self.meta_trajectories.clear();
        self.problem_class_history.clear();
        self.best_objective = f64::INFINITY;
        self.best_params.fill(0.0);
        self.meta_stats.meta_gradient_norms.clear();
        self.meta_stats.problem_class_performance.clear();
        self.curriculum_controller = CurriculumController::new();
        self.meta_experience_buffer = MetaExperienceBuffer::new(500);
    }
}

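/// Optimize `objective` with the meta-learning policy-gradient optimizer.
///
/// When `config` is `None`, a default of 100 episodes, 50 steps per episode,
/// and a learning rate of 1e-3 is used.
///
/// Illustrative usage (a sketch only; bring this function into scope from
/// wherever the crate exports this module):
///
/// ```ignore
/// use ndarray::{array, ArrayView1};
///
/// // Simple convex objective: sum of squares.
/// let objective = |x: &ArrayView1<f64>| x.iter().map(|v| v * v).sum::<f64>();
/// let x0 = array![0.5, -0.5];
///
/// let result =
///     advanced_advanced_policy_gradient_optimize(objective, &x0.view(), None).unwrap();
/// println!("best objective found: {}", result.fun);
/// ```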
#[allow(dead_code)]
pub fn advanced_advanced_policy_gradient_optimize<F>(
    objective: F,
    initial_params: &ArrayView1<f64>,
    config: Option<RLOptimizationConfig>,
) -> OptimizeResult<OptimizeResults<f64>>
where
    F: Fn(&ArrayView1<f64>) -> f64,
{
    let config = config.unwrap_or_else(|| RLOptimizationConfig {
        num_episodes: 100,
        max_steps_per_episode: 50,
        learning_rate: 0.001,
        ..Default::default()
    });

    let mut optimizer = AdvancedAdvancedPolicyGradientOptimizer::new(
        config,
        initial_params.len() + 5, // parameters plus five derived state features
        6,                        // number of discrete action types
    );
    optimizer.train(&objective, initial_params)
}

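/// Convenience wrapper that forwards to `advanced_advanced_policy_gradient_optimize`.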
#[allow(dead_code)]
pub fn policy_gradient_optimize<F>(
    objective: F,
    initial_params: &ArrayView1<f64>,
    config: Option<RLOptimizationConfig>,
) -> OptimizeResult<OptimizeResults<f64>>
where
    F: Fn(&ArrayView1<f64>) -> f64,
{
    advanced_advanced_policy_gradient_optimize(objective, initial_params, config)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_meta_policy_network_creation() {
        let network = MetaPolicyNetwork::new(4, 2, vec![8, 6]);
        assert_eq!(network.layer_sizes, vec![4, 8, 6, 2]);
    }

    #[test]
    fn test_meta_forward_pass() {
        let mut network = MetaPolicyNetwork::new(3, 2, vec![4]);
        let input = Array1::from(vec![0.5, -0.3, 0.8]);
        let meta_context = Array1::from(vec![0.1, 0.2]);

        let (policy_out, meta_out) = network.meta_forward(&input.view(), "test", &meta_context);

        assert_eq!(policy_out.len(), 2);
        assert_eq!(meta_out.len(), 2);
    }

    #[test]
    fn test_curriculum_controller() {
        let mut controller = CurriculumController::new();
        assert_eq!(controller.difficulty_level, 0.1);

        for _ in 0..25 {
            controller.update_progress(0.9);
        }

        assert!(controller.difficulty_level > 0.1);
    }

    #[test]
    fn test_meta_experience_buffer() {
        let mut buffer = MetaExperienceBuffer::new(10);

        let trajectory = MetaTrajectory {
            experiences: vec![],
            problem_class: "test".to_string(),
            initial_meta_context: Array1::zeros(3),
            learning_metrics: LearningMetrics {
                improvement_rate: 0.1,
                convergence_speed: 0.2,
                exploration_efficiency: 0.3,
                generalization_score: 0.4,
            },
            adaptation_speed: 0.1,
        };

        buffer.add_trajectory(trajectory);
        assert_eq!(buffer.trajectories.len(), 1);

        let batch = buffer.sample_meta_batch(1);
        assert_eq!(batch.len(), 1);
    }

    #[test]
    fn test_advanced_advanced_optimizer_creation() {
        let config = RLOptimizationConfig::default();
        let optimizer = AdvancedAdvancedPolicyGradientOptimizer::new(config, 4, 3);

        assert_eq!(optimizer.meta_policy.layer_sizes[0], 4);
        assert_eq!(optimizer.meta_policy.layer_sizes.last(), Some(&3));
    }

    #[test]
    fn test_problem_classification() {
        let config = RLOptimizationConfig::default();
        let optimizer = AdvancedAdvancedPolicyGradientOptimizer::new(config, 2, 3);

        let quadratic = |x: &ArrayView1<f64>| x[0].powi(2) + x[1].powi(2);
        let params = Array1::from(vec![1.0, 1.0]);

        let class = optimizer.classify_problem(&quadratic, &params.view());
        assert!(!class.is_empty());
    }

    #[test]
    fn test_meta_optimization() {
        let config = RLOptimizationConfig {
            num_episodes: 5,
            max_steps_per_episode: 10,
            learning_rate: 0.1,
            ..Default::default()
        };

        let objective = |x: &ArrayView1<f64>| (x[0] - 1.0).powi(2) + (x[1] + 0.5).powi(2);
        let initial = Array1::from(vec![0.0, 0.0]);

        let result =
            advanced_advanced_policy_gradient_optimize(objective, &initial.view(), Some(config))
                .unwrap();

        assert!(result.nit > 0);
        assert!(result.fun <= objective(&initial.view()));
    }
}

#[allow(dead_code)]
pub fn placeholder() {}