use super::{
ActivationType, LearnedOptimizationConfig, LearnedOptimizer, MetaOptimizerState,
OptimizationProblem, TrainingTask,
};
use crate::error::{OptimizeError, OptimizeResult};
use crate::result::OptimizeResults;
use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
use scirs2_core::random::{Rng, RngExt};
use statrs::statistics::Statistics;
use std::collections::{HashMap, VecDeque};
#[derive(Debug, Clone)]
/// Transformer-based learned optimizer: encodes recent optimization history
/// as a sequence and decodes the transformer output into concrete steps.
pub struct AdaptiveTransformerOptimizer {
    /// Meta-learning configuration (model width, heads, learning rates).
    config: LearnedOptimizationConfig,
    /// The transformer stack that maps state sequences to step proposals.
    transformer: OptimizationTransformer,
    /// Encodes (objective, params, problem) into transformer inputs.
    problem_encoder: TransformerProblemEncoder,
    /// Bounded FIFO of past parameters/objectives/gradients.
    history_buffer: OptimizationHistory,
    /// Shared meta-optimizer state (params, weights, episode counters).
    meta_state: MetaOptimizerState,
    /// Online-adaptation helpers (attention focus, LR, convergence, ...).
    adaptive_components: AdaptiveComponents,
    /// Diagnostics collected during optimization.
    performance_metrics: TransformerMetrics,
}
#[derive(Debug, Clone)]
/// Encoder-style transformer stack with learned input/output projections.
pub struct OptimizationTransformer {
    /// Number of stacked transformer blocks.
    num_layers: usize,
    /// The blocks, applied in order during `forward`.
    transformer_blocks: Vec<TransformerBlock>,
    /// Precomputed sinusoidal position-encoding table (max_seq_len x model_dim).
    position_encoding: Array2<f64>,
    /// Input embedding matrix (model_dim x model_dim).
    input_embedding: Array2<f64>,
    /// Final output projection matrix (model_dim x model_dim).
    output_projection: Array2<f64>,
    /// Model width shared by all layers.
    model_dim: usize,
}
#[derive(Debug, Clone)]
/// One pre-projection transformer block: attention + FFN with residuals
/// and post-residual layer normalization.
pub struct TransformerBlock {
    /// Multi-head self-attention sublayer.
    attention: MultiHeadAttention,
    /// Position-wise feed-forward sublayer.
    feed_forward: FeedForwardNetwork,
    /// LayerNorm applied after the attention residual.
    layer_norm1: LayerNormalization,
    /// LayerNorm applied after the feed-forward residual.
    layer_norm2: LayerNormalization,
    // NOTE(review): dropout_rate is stored but never applied in `forward`.
    dropout_rate: f64,
}
#[derive(Debug, Clone)]
/// Scaled dot-product multi-head self-attention with learned projections.
pub struct MultiHeadAttention {
    /// Number of attention heads; must divide `model_dim`.
    num_heads: usize,
    /// Per-head dimensionality (model_dim / num_heads).
    head_dim: usize,
    /// Query projection (model_dim x model_dim).
    w_query: Array2<f64>,
    /// Key projection (model_dim x model_dim).
    w_key: Array2<f64>,
    /// Value projection (model_dim x model_dim).
    w_value: Array2<f64>,
    /// Output projection applied after head concatenation.
    w_output: Array2<f64>,
    /// Rolling cache of the most recent softmax score matrices (capped at 10).
    attention_scores: Vec<Array2<f64>>,
}
#[derive(Debug, Clone)]
/// Two-layer position-wise feed-forward network with a nonlinearity between.
pub struct FeedForwardNetwork {
    /// First linear layer weights (hidden_dim x input_dim).
    linear1: Array2<f64>,
    /// Second linear layer weights (input_dim x hidden_dim).
    linear2: Array2<f64>,
    /// Bias added after the first linear layer.
    bias1: Array1<f64>,
    /// Bias added after the second linear layer.
    bias2: Array1<f64>,
    /// Nonlinearity applied to the hidden layer (GELU by default).
    activation: ActivationType,
    /// Width of the hidden layer.
    hidden_dim: usize,
}
#[derive(Debug, Clone)]
/// Per-row layer normalization with learned scale (`gamma`) and shift (`beta`).
pub struct LayerNormalization {
    /// Learned per-feature scale, initialized to ones.
    gamma: Array1<f64>,
    /// Learned per-feature shift, initialized to zeros.
    beta: Array1<f64>,
    /// Small constant added to the variance for numerical stability.
    epsilon: f64,
}
#[derive(Debug, Clone)]
/// Projects hand-crafted problem features into the transformer embedding space.
pub struct TransformerProblemEncoder {
    /// Projection for gradient-derived features.
    gradient_encoder: Array2<f64>,
    // NOTE(review): hessian/temporal encoders are allocated but unused here.
    hessian_encoder: Array2<f64>,
    /// Projection for parameter-statistics features.
    parameter_encoder: Array2<f64>,
    temporal_encoder: Array2<f64>,
    /// Projection for problem-metadata (context) features.
    context_encoder: Array2<f64>,
    /// Output embedding width (rows of each encoder matrix).
    embedding_dim: usize,
}
#[derive(Debug, Clone)]
/// Parallel bounded FIFOs recording the recent optimization trajectory.
/// All queues are pushed/popped together, so they stay index-aligned.
pub struct OptimizationHistory {
    /// Parameter vectors, oldest first.
    parameter_history: VecDeque<Array1<f64>>,
    /// Objective values aligned with `parameter_history`.
    objective_history: VecDeque<f64>,
    /// Step directions stored as gradient proxies.
    gradient_history: VecDeque<Array1<f64>>,
    /// Step sizes taken at each step.
    step_size_history: VecDeque<f64>,
    /// Whether each step was judged successful (confidence > 0.5).
    success_history: VecDeque<bool>,
    /// Maximum number of retained entries per queue.
    max_length: usize,
    /// Total steps ever recorded (monotonic, not capped).
    current_step: usize,
}
#[derive(Debug, Clone)]
/// Bundle of online-adaptation helpers updated after every optimization step.
pub struct AdaptiveComponents {
    /// Tracks and adapts where attention focuses across steps.
    attention_adaptation: AttentionAdaptation,
    /// Scales the learning rate from transformer feedback.
    learning_rate_adapter: LearningRateAdapter,
    // NOTE(review): gradient_scaler and step_size_predictor are constructed
    // but not consulted in this file.
    gradient_scaler: GradientScaler,
    step_size_predictor: StepSizePredictor,
    /// Maintains an EMA estimate of convergence probability.
    convergence_detector: ConvergenceDetector,
}
#[derive(Debug, Clone)]
/// Exponential-moving-average tracker of attention focus per feature.
pub struct AttentionAdaptation {
    /// EMA blending rate for new attention observations.
    adaptation_rate: f64,
    /// Current focus vector (one weight per model dimension).
    attention_focus: Array1<f64>,
    /// Recent focus vectors (capped at 100 entries).
    focus_history: VecDeque<Array1<f64>>,
    /// Named focus patterns per problem class; currently unused at runtime.
    problem_patterns: HashMap<String, Array1<f64>>,
}
#[derive(Debug, Clone)]
/// Multiplicatively adapts the learning rate from transformer feedback.
pub struct LearningRateAdapter {
    /// Fixed baseline learning rate.
    base_lr: f64,
    /// Most recent adapted learning rate (base_lr * factor).
    current_lr: f64,
    // NOTE(review): adaptation_params and performance_window are stored but
    // not read by any method in this file.
    adaptation_params: Array1<f64>,
    performance_window: VecDeque<f64>,
    /// Every learning rate produced so far, for efficiency scoring.
    lr_history: Vec<f64>,
}
#[derive(Debug, Clone)]
/// Per-component gradient scaling with running statistics.
/// NOTE(review): constructed but never applied in this file.
pub struct GradientScaler {
    /// Multiplicative scale per gradient component (initialized to 1).
    scale_factors: Array1<f64>,
    /// Running mean/variance of observed gradients.
    gradient_stats: GradientStatistics,
    /// Parameters controlling the scaling rule.
    scaling_params: Array1<f64>,
}
#[derive(Debug, Clone)]
/// Running first/second-moment statistics for gradients.
pub struct GradientStatistics {
    /// Running per-component mean.
    mean: Array1<f64>,
    /// Running per-component variance (initialized to ones).
    variance: Array1<f64>,
    /// Number of samples folded in so far.
    count: usize,
    /// EMA momentum used when updating the statistics.
    momentum: f64,
}
#[derive(Debug, Clone)]
/// Linear model predicting step sizes from feature vectors.
/// NOTE(review): constructed but not used for prediction in this file.
pub struct StepSizePredictor {
    /// Weights of the single-output linear predictor (1 x feature_dim).
    predictor_network: Array2<f64>,
    /// Expected input feature dimensionality.
    feature_dim: usize,
    /// Predicted step sizes, for accuracy tracking.
    prediction_history: Vec<f64>,
    /// Actually taken step sizes, aligned with `prediction_history`.
    actual_steps: Vec<f64>,
}
#[derive(Debug, Clone)]
/// Maintains an EMA estimate of the probability that optimization converged.
pub struct ConvergenceDetector {
    // NOTE(review): threshold/window_size/recent_improvements are stored but
    // the current `update` only maintains the EMA below.
    threshold: f64,
    window_size: usize,
    recent_improvements: VecDeque<f64>,
    /// EMA of reported convergence confidence (decay 0.9).
    convergence_prob: f64,
}
#[derive(Debug, Clone)]
/// Diagnostic metrics describing how well the learned optimizer is behaving.
pub struct TransformerMetrics {
    /// Mean entropy of the latest attention distributions.
    attention_entropy: f64,
    /// How smoothly the learning rate has been adapting (0..1).
    lr_adaptation_efficiency: f64,
    /// Accuracy of gradient predictions (0..1); currently never updated.
    gradient_prediction_accuracy: f64,
    /// Accuracy of step-size predictions (0..1); currently never updated.
    step_size_prediction_accuracy: f64,
    /// Current convergence-probability estimate surfaced as an accuracy.
    convergence_detection_accuracy: f64,
}
impl AdaptiveTransformerOptimizer {
    /// Builds the optimizer: a 6-layer transformer sized from `config`, a
    /// problem encoder, a 100-entry history buffer and fresh adaptive state.
    pub fn new(config: LearnedOptimizationConfig) -> Self {
        let model_dim = config.hidden_size;
        let transformer =
            OptimizationTransformer::new(config.num_heads, model_dim, config.max_parameters, 6);
        let problem_encoder = TransformerProblemEncoder::new(model_dim);
        let history_buffer = OptimizationHistory::new(100);
        Self {
            config,
            transformer,
            problem_encoder,
            history_buffer,
            meta_state: MetaOptimizerState {
                meta_params: Array1::zeros(model_dim),
                network_weights: Array2::zeros((model_dim, model_dim)),
                performance_history: Vec::new(),
                adaptation_stats: super::AdaptationStatistics::default(),
                episode: 0,
            },
            adaptive_components: AdaptiveComponents::new(model_dim),
            performance_metrics: TransformerMetrics::default(),
        }
    }

    /// Performs one learned optimization step: encodes the current state and
    /// history as a sequence, runs the transformer, decodes a step proposal,
    /// updates the adaptive components and records the step in history.
    ///
    /// # Errors
    /// Propagates any failure from encoding, the forward pass, or decoding.
    pub fn process_optimization_step<F>(
        &mut self,
        objective: &F,
        current_params: &ArrayView1<f64>,
        problem: &OptimizationProblem,
    ) -> OptimizeResult<OptimizationStep>
    where
        F: Fn(&ArrayView1<f64>) -> f64,
    {
        let state_encoding = self.encode_optimization_state(objective, current_params, problem)?;
        let transformer_output = self.transformer.forward(&state_encoding.view())?;
        let optimization_step = self.decode_optimization_step(&transformer_output.view())?;
        self.update_adaptive_components(&optimization_step)?;
        self.history_buffer.add_step(
            current_params.to_owned(),
            objective(current_params),
            optimization_step.clone(),
        );
        Ok(optimization_step)
    }

    /// Builds the input sequence: up to 50 historical states followed by the
    /// encoding of the current state in the last row.
    fn encode_optimization_state<F>(
        &self,
        objective: &F,
        current_params: &ArrayView1<f64>,
        problem: &OptimizationProblem,
    ) -> OptimizeResult<Array2<f64>>
    where
        F: Fn(&ArrayView1<f64>) -> f64,
    {
        // Sequence = min(steps so far, 50) history rows + 1 current-state row.
        let seq_len = self.history_buffer.current_step.min(50) + 1;
        let model_dim = self.transformer.model_dim;
        let mut sequence = Array2::zeros((seq_len, model_dim));
        for i in 0..seq_len - 1 {
            if let Some(historical_encoding) = self.encode_historical_state(i) {
                for j in 0..model_dim.min(historical_encoding.len()) {
                    sequence[[i, j]] = historical_encoding[j];
                }
            }
        }
        let current_encoding =
            self.problem_encoder
                .encode_current_state(objective, current_params, problem)?;
        let last_idx = seq_len - 1;
        for j in 0..model_dim.min(current_encoding.len()) {
            sequence[[last_idx, j]] = current_encoding[j];
        }
        Ok(sequence)
    }

    /// Encodes one historical record into a `model_dim` vector: parameters in
    /// the first quarter, objective at the quarter mark, gradient in the back
    /// half. Returns `None` when `history_index` is out of range.
    fn encode_historical_state(&self, history_index: usize) -> Option<Array1<f64>> {
        if history_index >= self.history_buffer.parameter_history.len() {
            return None;
        }
        let params = &self.history_buffer.parameter_history[history_index];
        let obj_val = self.history_buffer.objective_history[history_index];
        let mut encoding = Array1::zeros(self.transformer.model_dim);
        // BUG FIX: the loop pattern was corrupted to `¶m` (mojibake for
        // `&param`), which did not compile.
        for (i, &param) in params.iter().enumerate() {
            if i < encoding.len() / 4 {
                encoding[i] = param.tanh();
            }
        }
        let obj_idx = encoding.len() / 4;
        if obj_idx < encoding.len() {
            // BUG FIX: take abs() before ln(); ln() of a negative objective
            // produced NaN. For positive objectives the value is unchanged.
            encoding[obj_idx] = obj_val.abs().ln().abs().tanh();
        }
        if let Some(gradient) = self.history_buffer.gradient_history.get(history_index) {
            let grad_start = encoding.len() / 2;
            for (i, &grad) in gradient.iter().enumerate() {
                if grad_start + i < encoding.len() {
                    encoding[grad_start + i] = grad.tanh();
                }
            }
        }
        Some(encoding)
    }

    /// Decodes the last sequence position of the transformer output into a
    /// concrete step: step size, direction, LR factor and confidence, each
    /// squashed into its working range with `tanh`.
    fn decode_optimization_step(
        &self,
        transformer_output: &ArrayView2<f64>,
    ) -> OptimizeResult<OptimizationStep> {
        // is_empty() also covers a zero-column output, so the len()-1 index
        // arithmetic below cannot underflow.
        if transformer_output.is_empty() {
            return Err(OptimizeError::InvalidInput(
                // BUG FIX: message previously read "Empty transformer _output".
                "Empty transformer output".to_string(),
            ));
        }
        let last_output = transformer_output.row(transformer_output.nrows() - 1);
        // Slot 0 -> step size in (0, 0.02).
        let step_size_raw = last_output.get(0).copied().unwrap_or(0.0);
        let step_size = (step_size_raw.tanh() + 1.0) * 0.01;
        // Slots 1..=direction_dim -> direction components in (-1, 1).
        let direction_dim = self.meta_state.meta_params.len().min(last_output.len() - 1);
        let mut direction = Array1::zeros(direction_dim);
        for i in 0..direction_dim {
            direction[i] = last_output.get(i + 1).copied().unwrap_or(0.0).tanh();
        }
        // Middle slot -> learning-rate factor in (0.5, 1.5).
        let lr_factor_raw = last_output
            .get(last_output.len() / 2)
            .copied()
            .unwrap_or(0.0);
        let lr_adaptation_factor = (lr_factor_raw.tanh() + 1.0) * 0.5 + 0.5;
        // Last slot -> convergence confidence in (0, 1).
        let conv_raw = last_output
            .get(last_output.len() - 1)
            .copied()
            .unwrap_or(0.0);
        let convergence_confidence = (conv_raw.tanh() + 1.0) * 0.5;
        Ok(OptimizationStep {
            step_size,
            direction,
            lr_adaptation_factor,
            convergence_confidence,
            attention_weights: self.get_attention_weights(),
        })
    }

    /// Returns the latest attention score matrix of the first block, or a
    /// 1x1 zero matrix when none has been recorded yet.
    fn get_attention_weights(&self) -> Array2<f64> {
        self.get_latest_attention_scores()
            .unwrap_or_else(|| Array2::zeros((1, 1)))
    }

    /// Feeds the decoded step back into the adaptive components.
    fn update_adaptive_components(&mut self, step: &OptimizationStep) -> OptimizeResult<()> {
        self.adaptive_components
            .attention_adaptation
            .update(&step.attention_weights)?;
        self.adaptive_components
            .learning_rate_adapter
            .update(step.lr_adaptation_factor)?;
        self.adaptive_components
            .convergence_detector
            .update(step.convergence_confidence)?;
        Ok(())
    }

    /// Seeds the attention focus with a hand-tuned pattern per problem class.
    /// Unknown classes (and "neural_network", whose pattern equals the
    /// default) get the balanced [0.3, 0.4, 0.3] pattern.
    pub fn adapt_to_problem_class(&mut self, problem_class: &str) -> OptimizeResult<()> {
        let pattern = match problem_class {
            "quadratic" => vec![0.1, 0.2, 0.7],
            "sparse" => vec![0.5, 0.3, 0.2],
            _ => vec![0.3, 0.4, 0.3],
        };
        self.adaptive_components
            .attention_adaptation
            .set_focus_pattern(Array1::from(pattern));
        Ok(())
    }

    /// Hebbian-style fine-tuning: reinforces the output projection on every
    /// trajectory step that improved the objective.
    pub fn fine_tune_on_trajectories(
        &mut self,
        trajectories: &[OptimizationTrajectory],
    ) -> OptimizeResult<()> {
        for trajectory in trajectories {
            for step in &trajectory.steps {
                if step.improvement > 0.0 {
                    self.update_transformer_weights(&step.state_encoding, &step.action_encoding)?;
                }
            }
        }
        Ok(())
    }

    /// Applies one outer-product update to the output projection:
    /// W[i][j] += lr * action[i] * state[last_row][j].
    fn update_transformer_weights(
        &mut self,
        state_encoding: &Array2<f64>,
        action_encoding: &Array1<f64>,
    ) -> OptimizeResult<()> {
        let learning_rate = self.config.meta_learning_rate;
        for i in 0..self
            .transformer
            .output_projection
            .nrows()
            .min(action_encoding.len())
        {
            for j in 0..self.transformer.output_projection.ncols() {
                if let Some(&state_val) = state_encoding.get((state_encoding.nrows() - 1, j)) {
                    self.transformer.output_projection[[i, j]] +=
                        learning_rate * action_encoding[i] * state_val;
                }
            }
        }
        Ok(())
    }

    /// Read-only access to the collected diagnostics.
    pub fn get_performance_metrics(&self) -> &TransformerMetrics {
        &self.performance_metrics
    }

    /// Refreshes diagnostics from the latest attention scores and adapters.
    fn update_performance_metrics(&mut self) {
        if let Some(attention_scores) = self.get_latest_attention_scores() {
            self.performance_metrics.attention_entropy =
                compute_attention_entropy(&attention_scores);
        }
        self.performance_metrics.lr_adaptation_efficiency = self
            .adaptive_components
            .learning_rate_adapter
            .get_efficiency();
        self.performance_metrics.convergence_detection_accuracy =
            self.adaptive_components.convergence_detector.get_accuracy();
    }

    /// Latest attention score matrix from the first transformer block, if any.
    fn get_latest_attention_scores(&self) -> Option<Array2<f64>> {
        self.transformer
            .transformer_blocks
            .first()?
            .attention
            .attention_scores
            .last()
            .cloned()
    }
}
#[derive(Debug, Clone)]
/// A single decoded optimization step proposed by the transformer.
pub struct OptimizationStep {
    /// Step magnitude in (0, 0.02).
    pub step_size: f64,
    /// Step direction, components in (-1, 1).
    pub direction: Array1<f64>,
    /// Multiplicative learning-rate factor in (0.5, 1.5).
    pub lr_adaptation_factor: f64,
    /// Estimated convergence probability in (0, 1).
    pub convergence_confidence: f64,
    /// Snapshot of the attention scores that produced this step.
    pub attention_weights: Array2<f64>,
}
#[derive(Debug, Clone)]
/// A recorded optimization run used for fine-tuning the transformer.
pub struct OptimizationTrajectory {
    /// The individual steps taken during the run, in order.
    pub steps: Vec<TrajectoryStep>,
    /// Objective value at the end of the run.
    pub final_objective: f64,
    /// Whether the run was considered successful.
    pub success: bool,
}
#[derive(Debug, Clone)]
/// One (state, action, reward) record within an `OptimizationTrajectory`.
pub struct TrajectoryStep {
    /// Encoded optimizer state at this step (sequence x model_dim).
    pub state_encoding: Array2<f64>,
    /// Encoded action the optimizer took.
    pub action_encoding: Array1<f64>,
    /// Objective improvement achieved by this step (positive = better).
    pub improvement: f64,
    /// Index of this step within the trajectory.
    pub step_number: usize,
}
impl OptimizationTransformer {
    /// Builds a stack of `num_layers` transformer blocks with sinusoidal
    /// position encodings and random, Xavier-style input/output projections.
    pub fn new(num_heads: usize, model_dim: usize, max_seq_len: usize, num_layers: usize) -> Self {
        let transformer_blocks = (0..num_layers)
            .map(|_| TransformerBlock::new(num_heads, model_dim))
            .collect();
        let position_encoding = Self::create_position_encoding(max_seq_len, model_dim);
        // Shared init scale: sqrt(2 / model_dim), hoisted out of the closures.
        let scale = (2.0 / model_dim as f64).sqrt();
        let input_embedding = Array2::from_shape_fn((model_dim, model_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * scale
        });
        let output_projection = Array2::from_shape_fn((model_dim, model_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * scale
        });
        Self {
            num_layers,
            transformer_blocks,
            position_encoding,
            input_embedding,
            output_projection,
            model_dim,
        }
    }

    /// Full forward pass: embed the input, add position encodings, apply each
    /// transformer block in order, then project back to `model_dim`.
    ///
    /// # Errors
    /// Propagates any failure from a block's forward pass.
    pub fn forward(&mut self, input_sequence: &ArrayView2<f64>) -> OptimizeResult<Array2<f64>> {
        let seq_len = input_sequence.nrows();
        let input_dim = input_sequence.ncols();
        // Manual matmul: embedded = input * W_embed^T, clipped to shared dims.
        let mut embedded = Array2::zeros((seq_len, self.model_dim));
        for i in 0..seq_len {
            for j in 0..self.model_dim {
                for k in 0..input_dim.min(self.input_embedding.ncols()) {
                    embedded[[i, j]] += self.input_embedding[[j, k]] * input_sequence[[i, k]];
                }
            }
        }
        // Additive position encoding, bounded by the precomputed table size.
        for i in 0..seq_len.min(self.position_encoding.nrows()) {
            for j in 0..self.model_dim.min(self.position_encoding.ncols()) {
                embedded[[i, j]] += self.position_encoding[[i, j]];
            }
        }
        let mut current = embedded;
        for block in &mut self.transformer_blocks {
            // BUG FIX: this call site was corrupted to `¤t.view()`
            // (mojibake for `&current.view()`), which did not compile.
            current = block.forward(&current.view())?;
        }
        // Final output projection.
        let mut output = Array2::zeros((seq_len, self.model_dim));
        for i in 0..seq_len {
            for j in 0..self.model_dim {
                for k in 0..self.model_dim.min(self.output_projection.ncols()) {
                    output[[i, j]] += self.output_projection[[j, k]] * current[[i, k]];
                }
            }
        }
        Ok(output)
    }

    /// Standard sinusoidal position-encoding table: sin on even dimensions,
    /// cos on odd, with wavelengths geometric in 10000^(2i/d).
    fn create_position_encoding(max_len: usize, model_dim: usize) -> Array2<f64> {
        let mut pos_encoding = Array2::zeros((max_len, model_dim));
        for pos in 0..max_len {
            for i in 0..model_dim {
                let angle = pos as f64 / 10000_f64.powf(2.0 * i as f64 / model_dim as f64);
                pos_encoding[[pos, i]] = if i % 2 == 0 { angle.sin() } else { angle.cos() };
            }
        }
        pos_encoding
    }
}
impl TransformerBlock {
    /// Builds one block: attention, a 4x-wide feed-forward sublayer, and two
    /// layer norms. The stored dropout rate (0.1) is not applied in `forward`.
    pub fn new(num_heads: usize, model_dim: usize) -> Self {
        let attention = MultiHeadAttention::new(num_heads, model_dim);
        let feed_forward = FeedForwardNetwork::new(model_dim, model_dim * 4);
        let layer_norm1 = LayerNormalization::new(model_dim);
        let layer_norm2 = LayerNormalization::new(model_dim);
        Self {
            attention,
            feed_forward,
            layer_norm1,
            layer_norm2,
            dropout_rate: 0.1,
        }
    }
    /// Post-norm block: LayerNorm(x + SelfAttn(x)), then
    /// LayerNorm(y + FFN(y)). Self-attention uses the input as query, key
    /// and value.
    pub fn forward(&mut self, input: &ArrayView2<f64>) -> OptimizeResult<Array2<f64>> {
        let attention_output = self.attention.forward(input, input, input)?;
        // Residual connection around attention.
        let residual1 = input + &attention_output.view();
        let after_attention = self.layer_norm1.forward(&residual1.view())?;
        let ff_output = self.feed_forward.forward(&after_attention.view())?;
        // Residual connection around the feed-forward sublayer.
        let residual2 = &after_attention + &ff_output.view();
        let output = self.layer_norm2.forward(&residual2.view())?;
        Ok(output)
    }
}
impl MultiHeadAttention {
    /// Creates the four projection matrices with small random weights.
    ///
    /// # Panics
    /// Panics when `model_dim` is not divisible by `num_heads`.
    pub fn new(num_heads: usize, model_dim: usize) -> Self {
        assert_eq!(model_dim % num_heads, 0);
        let head_dim = model_dim / num_heads;
        let w_query = Array2::from_shape_fn((model_dim, model_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / model_dim as f64).sqrt()
        });
        let w_key = Array2::from_shape_fn((model_dim, model_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / model_dim as f64).sqrt()
        });
        let w_value = Array2::from_shape_fn((model_dim, model_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / model_dim as f64).sqrt()
        });
        let w_output = Array2::from_shape_fn((model_dim, model_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / model_dim as f64).sqrt()
        });
        Self {
            num_heads,
            head_dim,
            w_query,
            w_key,
            w_value,
            w_output,
            attention_scores: Vec::new(),
        }
    }
    /// Scaled dot-product attention over all heads: project Q/K/V, compute
    /// per-head softmax attention, concatenate the head outputs column-wise,
    /// then apply the output projection. Records scores as a side effect.
    pub fn forward(
        &mut self,
        query: &ArrayView2<f64>,
        key: &ArrayView2<f64>,
        value: &ArrayView2<f64>,
    ) -> OptimizeResult<Array2<f64>> {
        let seq_len = query.nrows();
        let model_dim = query.ncols();
        let q = self.linear_transform(query, &self.w_query)?;
        let k = self.linear_transform(key, &self.w_key)?;
        let v = self.linear_transform(value, &self.w_value)?;
        let mut attention_output = Array2::zeros((seq_len, model_dim));
        for head in 0..self.num_heads {
            // Each head owns a contiguous slice of columns.
            let head_start = head * self.head_dim;
            let head_end = head_start + self.head_dim;
            let q_head = q.slice(scirs2_core::ndarray::s![.., head_start..head_end]);
            let k_head = k.slice(scirs2_core::ndarray::s![.., head_start..head_end]);
            let v_head = v.slice(scirs2_core::ndarray::s![.., head_start..head_end]);
            let scores = self.compute_attention_scores(&q_head, &k_head)?;
            let head_output = self.apply_attention(&scores, &v_head)?;
            // Write the head output back into its column slice, clamped so a
            // short input (model_dim < projected width) cannot overrun.
            for i in 0..seq_len {
                for j in 0..self.head_dim.min(model_dim - head_start) {
                    attention_output[[i, head_start + j]] = head_output[[i, j]];
                }
            }
        }
        let output = self.linear_transform(&attention_output.view(), &self.w_output)?;
        Ok(output)
    }
    /// Manual matmul: output = input * weight^T, with the inner dimension
    /// clamped to whichever of the two operands is narrower.
    fn linear_transform(
        &self,
        input: &ArrayView2<f64>,
        weight: &Array2<f64>,
    ) -> OptimizeResult<Array2<f64>> {
        let seq_len = input.nrows();
        let input_dim = input.ncols();
        let output_dim = weight.nrows();
        let mut output = Array2::zeros((seq_len, output_dim));
        for i in 0..seq_len {
            for j in 0..output_dim {
                for k in 0..input_dim.min(weight.ncols()) {
                    output[[i, j]] += weight[[j, k]] * input[[i, k]];
                }
            }
        }
        Ok(output)
    }
    /// Computes softmax(QK^T / sqrt(d)) row-wise, caches the result in
    /// `attention_scores` (capped at the 10 most recent matrices).
    fn compute_attention_scores(
        &mut self,
        query: &ArrayView2<f64>,
        key: &ArrayView2<f64>,
    ) -> OptimizeResult<Array2<f64>> {
        let seq_len = query.nrows();
        let head_dim = query.ncols();
        let mut scores = Array2::zeros((seq_len, seq_len));
        // Standard 1/sqrt(d_k) temperature.
        let scale = 1.0 / (head_dim as f64).sqrt();
        for i in 0..seq_len {
            for j in 0..seq_len {
                let mut dot_product = 0.0;
                for k in 0..head_dim {
                    dot_product += query[[i, k]] * key[[j, k]];
                }
                scores[[i, j]] = dot_product * scale;
            }
        }
        // Numerically-stable row-wise softmax (subtract the row max first).
        for i in 0..seq_len {
            let mut row_sum = 0.0;
            let max_val = scores.row(i).fold(-f64::INFINITY, |a, &b| a.max(b));
            for j in 0..seq_len {
                scores[[i, j]] = (scores[[i, j]] - max_val).exp();
                row_sum += scores[[i, j]];
            }
            if row_sum > 0.0 {
                for j in 0..seq_len {
                    scores[[i, j]] /= row_sum;
                }
            }
        }
        self.attention_scores.push(scores.clone());
        if self.attention_scores.len() > 10 {
            self.attention_scores.remove(0);
        }
        Ok(scores)
    }
    /// Weighted sum of value rows: output = scores * values.
    fn apply_attention(
        &self,
        scores: &Array2<f64>,
        values: &ArrayView2<f64>,
    ) -> OptimizeResult<Array2<f64>> {
        let seq_len = scores.nrows();
        let head_dim = values.ncols();
        let mut output = Array2::zeros((seq_len, head_dim));
        for i in 0..seq_len {
            for j in 0..head_dim {
                for k in 0..seq_len {
                    output[[i, j]] += scores[[i, k]] * values[[k, j]];
                }
            }
        }
        Ok(output)
    }
}
impl FeedForwardNetwork {
    /// Builds a two-layer MLP with GELU activation, zero biases, and
    /// Xavier-style random weights.
    pub fn new(input_dim: usize, hidden_dim: usize) -> Self {
        let linear1 = Array2::from_shape_fn((hidden_dim, input_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / input_dim as f64).sqrt()
        });
        let linear2 = Array2::from_shape_fn((input_dim, hidden_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * (2.0 / hidden_dim as f64).sqrt()
        });
        Self {
            linear1,
            linear2,
            bias1: Array1::zeros(hidden_dim),
            bias2: Array1::zeros(input_dim),
            activation: ActivationType::GELU,
            hidden_dim,
        }
    }
    /// Position-wise MLP: activation(x * W1^T + b1) * W2^T + b2, applied to
    /// each sequence row independently. Inner dims are clamped defensively.
    pub fn forward(&self, input: &ArrayView2<f64>) -> OptimizeResult<Array2<f64>> {
        let seq_len = input.nrows();
        let input_dim = input.ncols();
        let mut hidden = Array2::zeros((seq_len, self.hidden_dim));
        for i in 0..seq_len {
            for j in 0..self.hidden_dim {
                for k in 0..input_dim.min(self.linear1.ncols()) {
                    hidden[[i, j]] += self.linear1[[j, k]] * input[[i, k]];
                }
                hidden[[i, j]] += self.bias1[j];
                // Nonlinearity applied element-wise to the hidden layer.
                hidden[[i, j]] = self.activation.apply(hidden[[i, j]]);
            }
        }
        let mut output = Array2::zeros((seq_len, input_dim));
        for i in 0..seq_len {
            for j in 0..input_dim {
                for k in 0..self.hidden_dim.min(self.linear2.ncols()) {
                    output[[i, j]] += self.linear2[[j, k]] * hidden[[i, k]];
                }
                output[[i, j]] += self.bias2[j];
            }
        }
        Ok(output)
    }
}
impl LayerNormalization {
    /// Identity-initialized layer norm: gamma = 1, beta = 0, eps = 1e-6.
    pub fn new(dim: usize) -> Self {
        Self {
            gamma: Array1::ones(dim),
            beta: Array1::zeros(dim),
            epsilon: 1e-6,
        }
    }
    /// Normalizes each row to zero mean / unit variance, then applies the
    /// learned affine transform: gamma * (x - mean)/std + beta.
    pub fn forward(&self, input: &ArrayView2<f64>) -> OptimizeResult<Array2<f64>> {
        let seq_len = input.nrows();
        let dim = input.ncols();
        let mut output = Array2::zeros((seq_len, dim));
        for i in 0..seq_len {
            let row = input.row(i);
            // NOTE(review): mean()/variance() presumably resolve via the
            // statrs `Statistics` trait imported at the top — confirm.
            let mean = row.mean();
            let var = input.row(i).variance();
            let std = (var + self.epsilon).sqrt();
            for j in 0..dim.min(self.gamma.len()) {
                output[[i, j]] = self.gamma[j] * (input[[i, j]] - mean) / std + self.beta[j];
            }
        }
        Ok(output)
    }
}
impl TransformerProblemEncoder {
pub fn new(embedding_dim: usize) -> Self {
let feature_dim = 20;
Self {
gradient_encoder: Array2::from_shape_fn((embedding_dim, feature_dim), |_| {
(scirs2_core::random::rng().random::<f64>() - 0.5) * 0.1
}),
hessian_encoder: Array2::from_shape_fn((embedding_dim, feature_dim), |_| {
(scirs2_core::random::rng().random::<f64>() - 0.5) * 0.1
}),
parameter_encoder: Array2::from_shape_fn((embedding_dim, feature_dim), |_| {
(scirs2_core::random::rng().random::<f64>() - 0.5) * 0.1
}),
temporal_encoder: Array2::from_shape_fn((embedding_dim, feature_dim), |_| {
(scirs2_core::random::rng().random::<f64>() - 0.5) * 0.1
}),
context_encoder: Array2::from_shape_fn((embedding_dim, feature_dim), |_| {
(scirs2_core::random::rng().random::<f64>() - 0.5) * 0.1
}),
embedding_dim,
}
}
pub fn encode_current_state<F>(
&self,
objective: &F,
current_params: &ArrayView1<f64>,
problem: &OptimizationProblem,
) -> OptimizeResult<Array1<f64>>
where
F: Fn(&ArrayView1<f64>) -> f64,
{
let mut encoding = Array1::zeros(self.embedding_dim);
let param_features = self.encode_parameter_features(current_params);
let grad_features = self.encode_gradient_features(objective, current_params);
let context_features = self.encode_context_features(problem);
self.combine_features(&mut encoding, ¶m_features, &self.parameter_encoder);
self.combine_features(&mut encoding, &grad_features, &self.gradient_encoder);
self.combine_features(&mut encoding, &context_features, &self.context_encoder);
Ok(encoding)
}
fn encode_parameter_features(&self, params: &ArrayView1<f64>) -> Array1<f64> {
let mut features = Array1::zeros(20);
if !params.is_empty() {
features[0] = params.view().mean().tanh();
features[1] = params.view().variance().sqrt().tanh();
features[2] = params.fold(-f64::INFINITY, |a, &b| a.max(b)).tanh();
features[3] = params.fold(f64::INFINITY, |a, &b| a.min(b)).tanh();
features[4] = (params.len() as f64).ln().tanh();
features[5] =
(params.iter().map(|&x| x.abs()).sum::<f64>() / params.len() as f64).tanh(); features[6] = (params.iter().map(|&x| x * x).sum::<f64>()).sqrt().tanh();
let mean = features[0];
let skewness = params
.iter()
.map(|&x| ((x - mean) / (features[1] + 1e-8)).powi(3))
.sum::<f64>()
/ params.len() as f64;
features[7] = skewness.tanh();
let zero_count = params.iter().filter(|&&x| x.abs() < 1e-8).count();
features[8] = (zero_count as f64 / params.len() as f64).tanh();
}
features
}
fn encode_gradient_features<F>(&self, objective: &F, params: &ArrayView1<f64>) -> Array1<f64>
where
F: Fn(&ArrayView1<f64>) -> f64,
{
let mut features = Array1::zeros(20);
let h = 1e-6;
let f0 = objective(params);
let mut gradient = Array1::zeros(params.len());
for i in 0..params.len().min(20) {
let mut params_plus = params.to_owned();
params_plus[i] += h;
let f_plus = objective(¶ms_plus.view());
gradient[i] = (f_plus - f0) / h;
}
if !gradient.is_empty() {
features[0] = (gradient.iter().map(|&g| g * g).sum::<f64>())
.sqrt()
.ln()
.tanh(); features[1] = f0.abs().ln().tanh(); features[2] = gradient.view().mean().tanh(); features[3] = gradient.view().variance().sqrt().tanh();
let grad_consistency = gradient
.iter()
.zip(params.iter())
.map(|(&g, &p)| if p * g < 0.0 { 1.0 } else { 0.0 })
.sum::<f64>()
/ gradient.len() as f64;
features[4] = grad_consistency.tanh();
}
features
}
fn encode_context_features(&self, problem: &OptimizationProblem) -> Array1<f64> {
let mut features = Array1::zeros(20);
features[0] = (problem.dimension as f64).ln().tanh();
features[1] = (problem.max_evaluations as f64).ln().tanh();
features[2] = problem.target_accuracy.ln().abs().tanh();
match problem.problem_class.as_str() {
"quadratic" => features[3] = 1.0,
"neural_network" => features[4] = 1.0,
"sparse" => {
features[5] = 1.0;
features[6] = 1.0;
}
_ => {} }
features
}
fn combine_features(
&self,
encoding: &mut Array1<f64>,
features: &Array1<f64>,
encoder: &Array2<f64>,
) {
for i in 0..encoding.len() {
for j in 0..features.len().min(encoder.ncols()) {
encoding[i] += encoder[[i, j]] * features[j];
}
}
}
}
impl OptimizationHistory {
pub fn new(max_length: usize) -> Self {
Self {
parameter_history: VecDeque::with_capacity(max_length),
objective_history: VecDeque::with_capacity(max_length),
gradient_history: VecDeque::with_capacity(max_length),
step_size_history: VecDeque::with_capacity(max_length),
success_history: VecDeque::with_capacity(max_length),
max_length,
current_step: 0,
}
}
pub fn add_step(&mut self, params: Array1<f64>, objective: f64, step: OptimizationStep) {
if self.parameter_history.len() >= self.max_length {
self.parameter_history.pop_front();
self.objective_history.pop_front();
self.gradient_history.pop_front();
self.step_size_history.pop_front();
self.success_history.pop_front();
}
self.parameter_history.push_back(params);
self.objective_history.push_back(objective);
self.gradient_history.push_back(step.direction);
self.step_size_history.push_back(step.step_size);
self.success_history
.push_back(step.convergence_confidence > 0.5);
self.current_step += 1;
}
}
impl AdaptiveComponents {
    /// Wires up every adaptive sub-component for a model of width `model_dim`.
    pub fn new(model_dim: usize) -> Self {
        let attention_adaptation = AttentionAdaptation::new(model_dim);
        let gradient_scaler = GradientScaler::new(model_dim);
        let step_size_predictor = StepSizePredictor::new(model_dim);
        Self {
            attention_adaptation,
            learning_rate_adapter: LearningRateAdapter::new(),
            gradient_scaler,
            step_size_predictor,
            convergence_detector: ConvergenceDetector::new(),
        }
    }
}
impl AttentionAdaptation {
    /// Starts with a uniform focus distribution over `model_dim` features.
    pub fn new(model_dim: usize) -> Self {
        Self {
            adaptation_rate: 0.01,
            attention_focus: Array1::from_elem(model_dim, 1.0 / model_dim as f64),
            focus_history: VecDeque::with_capacity(100),
            problem_patterns: HashMap::new(),
        }
    }
    /// Blends the row-means of the observed attention matrix into the focus
    /// vector via EMA, then appends a snapshot to the (100-capped) history.
    /// An empty attention matrix is a no-op.
    pub fn update(&mut self, attention_weights: &Array2<f64>) -> OptimizeResult<()> {
        if attention_weights.is_empty() {
            return Ok(());
        }
        let mut new_focus = Array1::zeros(self.attention_focus.len());
        for i in 0..attention_weights.nrows().min(new_focus.len()) {
            // NOTE(review): mean() presumably resolves via the statrs
            // `Statistics` trait imported at the top — confirm.
            new_focus[i] = attention_weights.row(i).mean();
        }
        // EMA blend; rows beyond the observation decay toward zero.
        for i in 0..self.attention_focus.len() {
            self.attention_focus[i] = (1.0 - self.adaptation_rate) * self.attention_focus[i]
                + self.adaptation_rate * new_focus.get(i).copied().unwrap_or(0.0);
        }
        self.focus_history.push_back(self.attention_focus.clone());
        if self.focus_history.len() > 100 {
            self.focus_history.pop_front();
        }
        Ok(())
    }
    /// Overwrites the leading focus entries with `pattern`; silently ignored
    /// when the pattern is longer than the focus vector.
    pub fn set_focus_pattern(&mut self, pattern: Array1<f64>) {
        if pattern.len() <= self.attention_focus.len() {
            for (i, &val) in pattern.iter().enumerate() {
                self.attention_focus[i] = val;
            }
        }
    }
}
impl Default for LearningRateAdapter {
    /// Equivalent to [`LearningRateAdapter::new`].
    fn default() -> Self {
        Self::new()
    }
}
impl LearningRateAdapter {
    /// Starts with a 0.01 base learning rate and default adaptation weights.
    pub fn new() -> Self {
        let base_lr = 0.01;
        Self {
            base_lr,
            current_lr: base_lr,
            adaptation_params: Array1::from(vec![0.9, 0.1, 0.001]),
            performance_window: VecDeque::with_capacity(10),
            lr_history: Vec::new(),
        }
    }
    /// Rescales the base learning rate by `lr_factor` and records the result.
    pub fn update(&mut self, lr_factor: f64) -> OptimizeResult<()> {
        let adapted = self.base_lr * lr_factor;
        self.current_lr = adapted;
        self.lr_history.push(adapted);
        Ok(())
    }
    /// Efficiency heuristic in (0, 1]: the smoother the learning-rate
    /// trajectory (smaller average step-to-step change), the closer to 1.
    /// Returns 0.5 until at least two rates have been recorded.
    pub fn get_efficiency(&self) -> f64 {
        let n = self.lr_history.len();
        if n < 2 {
            return 0.5;
        }
        let total_change: f64 = self
            .lr_history
            .windows(2)
            .map(|pair| (pair[1] - pair[0]).abs())
            .sum();
        let avg_change = total_change / (n - 1) as f64;
        (1.0 / (1.0 + avg_change)).min(1.0)
    }
}
impl GradientScaler {
    /// Identity scaling over `model_dim` components with fresh running stats
    /// (zero mean, unit variance, momentum 0.9).
    pub fn new(model_dim: usize) -> Self {
        let gradient_stats = GradientStatistics {
            mean: Array1::zeros(model_dim),
            variance: Array1::ones(model_dim),
            count: 0,
            momentum: 0.9,
        };
        Self {
            scale_factors: Array1::ones(model_dim),
            gradient_stats,
            scaling_params: Array1::from_elem(model_dim, 1.0),
        }
    }
}
impl StepSizePredictor {
    /// Single-output linear predictor over `feature_dim` inputs with small
    /// random weights and empty prediction/actual logs.
    pub fn new(feature_dim: usize) -> Self {
        let weights = Array2::from_shape_fn((1, feature_dim), |_| {
            (scirs2_core::random::rng().random::<f64>() - 0.5) * 0.1
        });
        Self {
            predictor_network: weights,
            feature_dim,
            prediction_history: Vec::new(),
            actual_steps: Vec::new(),
        }
    }
}
impl Default for ConvergenceDetector {
    /// Equivalent to [`ConvergenceDetector::new`].
    fn default() -> Self {
        Self::new()
    }
}
impl ConvergenceDetector {
    /// Detector with a 1e-6 improvement threshold, a 10-step window and a
    /// zero initial convergence estimate.
    pub fn new() -> Self {
        Self {
            threshold: 1e-6,
            window_size: 10,
            recent_improvements: VecDeque::with_capacity(10),
            convergence_prob: 0.0,
        }
    }
    /// Folds a new confidence reading into the EMA estimate (decay 0.9).
    pub fn update(&mut self, confidence: f64) -> OptimizeResult<()> {
        let blended = 0.9 * self.convergence_prob + 0.1 * confidence;
        self.convergence_prob = blended;
        Ok(())
    }
    /// Current EMA convergence estimate, reported as an accuracy.
    pub fn get_accuracy(&self) -> f64 {
        self.convergence_prob
    }
}
impl Default for TransformerMetrics {
    /// Neutral starting metrics: zero entropy, 0.5 for every accuracy and
    /// efficiency score (uninformed prior).
    fn default() -> Self {
        Self {
            attention_entropy: 0.0,
            lr_adaptation_efficiency: 0.5,
            gradient_prediction_accuracy: 0.5,
            step_size_prediction_accuracy: 0.5,
            convergence_detection_accuracy: 0.5,
        }
    }
}
impl LearnedOptimizer for AdaptiveTransformerOptimizer {
    /// Meta-trains on each task: adapts the attention focus to the task's
    /// problem class, samples an initial point from the task's distribution,
    /// then runs 10 optimization steps on a quadratic surrogate objective to
    /// warm up the transformer and refresh the metrics.
    fn meta_train(&mut self, training_tasks: &[TrainingTask]) -> OptimizeResult<()> {
        for task in training_tasks {
            self.adapt_to_problem_class(&task.problem.problem_class)?;
            let initial_params = match &task.initial_distribution {
                super::ParameterDistribution::Uniform { low, high } => {
                    Array1::from_shape_fn(task.problem.dimension, |_| {
                        low + scirs2_core::random::rng().random::<f64>() * (high - low)
                    })
                }
                // NOTE(review): uniform surrogate scaled by std, not a true
                // normal draw — confirm this is intentional.
                super::ParameterDistribution::Normal { mean, std } => {
                    Array1::from_shape_fn(task.problem.dimension, |_| {
                        mean + std * (scirs2_core::random::rng().random::<f64>() - 0.5) * 2.0
                    })
                }
                super::ParameterDistribution::Custom { samples } => {
                    if !samples.is_empty() {
                        samples[scirs2_core::random::rng().random_range(0..samples.len())].clone()
                    } else {
                        Array1::zeros(task.problem.dimension)
                    }
                }
            };
            // Simple quadratic surrogate used purely for warm-up steps.
            let training_objective = |x: &ArrayView1<f64>| x.iter().map(|&xi| xi * xi).sum::<f64>();
            // NOTE(review): the same point is re-encoded each iteration — the
            // step is not applied to `initial_params`; confirm intended.
            for _ in 0..10 {
                let _step = self.process_optimization_step(
                    &training_objective,
                    &initial_params.view(),
                    &task.problem,
                )?;
                self.update_performance_metrics();
            }
        }
        Ok(())
    }
    /// Adapts the attention focus to the given problem's class; the initial
    /// parameters are currently unused.
    fn adapt_to_problem(
        &mut self,
        problem: &OptimizationProblem,
        _initial_params: &ArrayView1<f64>,
    ) -> OptimizeResult<()> {
        self.adapt_to_problem_class(&problem.problem_class)
    }
    /// Runs up to 1000 transformer-guided descent steps, stopping early when
    /// convergence confidence exceeds 0.95 or the step size collapses below
    /// 1e-8. Returns the final point and best objective value seen.
    fn optimize<F>(
        &mut self,
        objective: F,
        initial_params: &ArrayView1<f64>,
    ) -> OptimizeResult<OptimizeResults<f64>>
    where
        F: Fn(&ArrayView1<f64>) -> f64,
    {
        let mut current_params = initial_params.to_owned();
        let mut best_value = objective(initial_params);
        let mut iterations = 0;
        // Placeholder metadata for callers that do not supply a problem.
        let default_problem = OptimizationProblem {
            name: "unknown".to_string(),
            dimension: initial_params.len(),
            problem_class: "general".to_string(),
            metadata: HashMap::new(),
            max_evaluations: 1000,
            target_accuracy: 1e-6,
        };
        for iter in 0..1000 {
            iterations = iter;
            // BUG FIX: this call site was corrupted to `¤t_params.view()`
            // (mojibake for `&current_params.view()`), which did not compile.
            let step = self.process_optimization_step(
                &objective,
                &current_params.view(),
                &default_problem,
            )?;
            // Descent update: move against the proposed direction.
            for i in 0..current_params.len().min(step.direction.len()) {
                current_params[i] -= step.step_size * step.direction[i];
            }
            let current_value = objective(&current_params.view());
            if current_value < best_value {
                best_value = current_value;
            }
            if step.convergence_confidence > 0.95 || step.step_size < 1e-8 {
                break;
            }
        }
        Ok(OptimizeResults::<f64> {
            x: current_params,
            fun: best_value,
            success: true,
            nit: iterations,
            message: "Transformer optimization completed".to_string(),
            ..OptimizeResults::default()
        })
    }
    /// Read-only access to the shared meta-optimizer state.
    fn get_state(&self) -> &MetaOptimizerState {
        &self.meta_state
    }
    /// Clears history, metrics and the episode counter; learned transformer
    /// weights are intentionally kept.
    fn reset(&mut self) {
        self.history_buffer = OptimizationHistory::new(100);
        self.performance_metrics = TransformerMetrics::default();
        self.meta_state.episode = 0;
    }
}
#[allow(dead_code)]
/// Mean Shannon entropy of the rows of an attention score matrix.
/// Probabilities at or below 1e-8 are skipped so `ln` stays finite.
/// (A zero-row matrix yields NaN, matching the previous behavior.)
fn compute_attention_entropy(attention_scores: &Array2<f64>) -> f64 {
    let num_heads = attention_scores.nrows();
    let total_entropy: f64 = (0..num_heads)
        .map(|row_idx| {
            attention_scores
                .row(row_idx)
                .iter()
                .filter(|&&p| p > 1e-8)
                .map(|&p| -(p * p.ln()))
                .sum::<f64>()
        })
        .sum();
    total_entropy / num_heads as f64
}
#[allow(dead_code)]
/// Convenience entry point: builds an [`AdaptiveTransformerOptimizer`] from
/// `config` (or the default configuration when `None`) and runs it on the
/// given objective from `initial_params`.
pub fn transformer_optimize<F>(
    objective: F,
    initial_params: &ArrayView1<f64>,
    config: Option<LearnedOptimizationConfig>,
) -> super::OptimizeResult<OptimizeResults<f64>>
where
    F: Fn(&ArrayView1<f64>) -> f64,
{
    let effective_config = config.unwrap_or_default();
    let mut optimizer = AdaptiveTransformerOptimizer::new(effective_config);
    optimizer.optimize(objective, initial_params)
}
#[cfg(test)]
mod tests {
    use super::*;
    /// A fresh optimizer has the configured 6-layer transformer stack.
    #[test]
    fn test_transformer_optimizer_creation() {
        let config = LearnedOptimizationConfig::default();
        let optimizer = AdaptiveTransformerOptimizer::new(config);
        assert_eq!(optimizer.transformer.num_layers, 6);
        assert!(!optimizer.transformer.transformer_blocks.is_empty());
    }
    /// Constructor wires layer count, model width and blocks consistently.
    #[test]
    fn test_optimization_transformer() {
        let transformer = OptimizationTransformer::new(4, 64, 100, 2);
        assert_eq!(transformer.num_layers, 2);
        assert_eq!(transformer.model_dim, 64);
        assert_eq!(transformer.transformer_blocks.len(), 2);
    }
    /// head_dim = model_dim / num_heads.
    #[test]
    fn test_multi_head_attention() {
        let attention = MultiHeadAttention::new(4, 64);
        assert_eq!(attention.num_heads, 4);
        assert_eq!(attention.head_dim, 16);
    }
    /// Forward pass preserves the (seq_len, model_dim) shape.
    #[test]
    fn test_transformer_forward_pass() {
        let mut transformer = OptimizationTransformer::new(2, 32, 10, 1);
        let input = Array2::from_shape_fn((5, 32), |_| scirs2_core::random::rng().random::<f64>());
        let output = transformer
            .forward(&input.view())
            .expect("Operation failed");
        assert_eq!(output.nrows(), 5);
        assert_eq!(output.ncols(), 32);
    }
    /// Encoding a simple quadratic state yields a finite embedding.
    #[test]
    fn test_problem_encoder() {
        let encoder = TransformerProblemEncoder::new(64);
        let params = Array1::from(vec![1.0, 2.0]);
        let objective = |x: &ArrayView1<f64>| x[0].powi(2) + x[1].powi(2);
        let problem = OptimizationProblem {
            name: "test".to_string(),
            dimension: 2,
            problem_class: "quadratic".to_string(),
            metadata: HashMap::new(),
            max_evaluations: 1000,
            target_accuracy: 1e-6,
        };
        // BUG FIX: `&params.view()` was corrupted to `¶ms.view()`
        // (mojibake), which did not compile.
        let encoding = encoder
            .encode_current_state(&objective, &params.view(), &problem)
            .expect("Operation failed");
        assert_eq!(encoding.len(), 64);
        assert!(encoding.iter().all(|&x| x.is_finite()));
    }
    /// End-to-end smoke test on a 2D quadratic (slow, kept ignored).
    #[test]
    #[ignore = "Real timeout - test runs >60 seconds"]
    fn test_transformer_optimization() {
        let objective = |x: &ArrayView1<f64>| x[0].powi(2) + x[1].powi(2);
        let initial = Array1::from(vec![2.0, 2.0]);
        let config = LearnedOptimizationConfig {
            meta_training_episodes: 5,
            hidden_size: 32,
            num_heads: 2,
            ..Default::default()
        };
        let result = transformer_optimize(objective, &initial.view(), Some(config))
            .expect("Operation failed");
        assert!(result.fun >= 0.0);
        assert_eq!(result.x.len(), 2);
        assert!(result.success);
    }
}
#[allow(dead_code)]
/// Intentionally empty: kept as a stable public symbol.
/// NOTE(review): presumably a module-layout placeholder — confirm whether it
/// can be removed.
pub fn placeholder() {
}