#version 430
// Neural Network + Reinforcement Learning Adaptive Step Size Control
// GPU compute shader that selects per-system ODE step-size multipliers
// using Q-learning with a dueling deep Q-network as the function approximator
layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
// State representation for RL agent (64 features per ODE system)
layout(std430, binding = 0) buffer StateBuffer {
float state_features[]; // 64 features per ODE system: [error_history_8, step_history_8, jacobian_eigenvalues_8, problem_char_4, performance_metrics_8, temporal_features_8, accuracy_features_4, convergence_features_4, memory_features_4, gpu_utilization_features_4, cache_features_4]
};
// Action space: discrete step size multipliers (32 possible actions)
layout(std430, binding = 1) buffer ActionBuffer {
float action_values[]; // Q-values for each action [0.1, 0.2, 0.3, ..., 3.2]
};
// Reward signal based on performance metrics
layout(std430, binding = 2) buffer RewardBuffer {
float rewards[]; // Reward for previous action
};
// Deep Q-Network weights (3-layer network: 64->128->64->32)
layout(std430, binding = 3) buffer DQNWeightsBuffer {
float dqn_weights[]; // Pre-trained DQN weights: per-layer weights followed by biases; the output layer stores 32x64 advantage weights followed by 64 state-value weights (dueling head)
};
// Neural network hidden states
layout(std430, binding = 4) buffer HiddenStatesBuffer {
float hidden_layer1[]; // 128 neurons
float hidden_layer2[]; // 64 neurons
};
// Experience replay buffer for training
layout(std430, binding = 5) buffer ExperienceBuffer {
float experience_memory[]; // [state_t(64), action_t, reward_t, state_t+1(64), done, priority_weight] per transition (stride 132 floats) * buffer_size
};
// Target network weights for stable learning
layout(std430, binding = 6) buffer TargetNetworkBuffer {
float target_weights[]; // Slowly updated target network
};
// Priority replay weights (for prioritized experience replay)
layout(std430, binding = 7) buffer PriorityBuffer {
float priorities[]; // TD-error based priorities
};
// Output: optimal step size for each ODE system
layout(std430, binding = 8) buffer OptimalStepBuffer {
float optimal_steps[];
};
// Training metadata
layout(std430, binding = 9) buffer TrainingMetaBuffer {
float training_data[]; // [episode_count, total_reward, epsilon, learning_rate, target_update_counter]
};
// Hyperparameters
uniform float epsilon; // Exploration rate for epsilon-greedy
uniform float learning_rate; // Learning rate for neural network
uniform float discount_factor; // Gamma for future reward discounting
uniform float target_update_freq; // How often to update target network
uniform int batch_size; // Mini-batch size for training
uniform int replay_buffer_size; // Size of experience replay buffer
uniform int num_systems; // Number of ODE systems
uniform bool training_mode; // Whether to update weights during inference
uniform float tau; // Soft update parameter for target network
uniform float priority_alpha; // Prioritized replay exponent
uniform float priority_beta; // Importance sampling correction
// Per-invocation scratch space in shared memory (~36 KB per 32-invocation workgroup; this exceeds the 32 KB minimum guaranteed by GL 4.3, so check GL_MAX_COMPUTE_SHARED_MEMORY_SIZE)
shared float shared_features[32 * 64]; // Input features
shared float shared_hidden1[32 * 128]; // Hidden layer 1
shared float shared_hidden2[32 * 64]; // Hidden layer 2
shared float shared_q_values[32 * 32]; // Q-values output
// Advanced activation functions
float leaky_relu(float x) {
return x > 0.0 ? x : 0.01 * x;
}
float swish(float x) {
return x / (1.0 + exp(-x));
}
float mish(float x) {
// Softplus computed in overflow-safe form: for large x, log(1 + exp(x)) ~ x
float softplus = (x > 20.0) ? x : log(1.0 + exp(x));
return x * tanh(softplus);
}
// Batch normalization
float batch_norm(float x, float mean, float variance, float gamma, float beta) {
return gamma * (x - mean) / sqrt(variance + 1e-8) + beta;
}
// Huber loss for robust training
float huber_loss(float delta, float threshold) {
float abs_delta = abs(delta);
if (abs_delta <= threshold) {
return 0.5 * delta * delta;
} else {
return threshold * (abs_delta - 0.5 * threshold);
}
}
// Noisy linear unit (NoisyNet-style) for exploration; helper, not currently wired into the forward pass
float noisy_linear(float input, float weight, float noise_weight, float noise) {
return input * (weight + noise_weight * noise);
}
// Dueling network architecture: V(s) + A(s,a) - mean(A(s,:))
float dueling_q_value(float state_value, float advantage, float mean_advantage) {
return state_value + advantage - mean_advantage;
}
// Multi-step (n-step) return over consecutive reward-buffer entries; helper, not invoked in this pass
float multi_step_return(int start_idx, int n_steps, float gamma) {
float return_value = 0.0;
float gamma_power = 1.0;
for (int i = 0; i < n_steps; i++) {
int reward_idx = start_idx + i;
if (reward_idx < replay_buffer_size) {
return_value += gamma_power * rewards[reward_idx];
gamma_power *= gamma;
}
}
return return_value;
}
// Attention-style weighting for feature importance (single unnormalized score; helper, not used in this pass)
float attention_weight(float query, float key, float value, float scale) {
float attention_score = exp((query * key) / scale);
return attention_score * value;
}
// Forward pass through the deep Q-network
// Each invocation evaluates the full network for its own ODE system using its private rows
// of the shared arrays (indexed by gl_LocalInvocationID.x), so no barriers are required here.
void forward_pass_dqn(uint system_idx) {
uint tid = gl_LocalInvocationID.x;
uint feature_offset = tid * 64; // Features were staged into shared_features by main()
uint weight_offset = 0;
// Layer 1: 64 -> 128 with Mish activation and batch normalization
for (uint h = 0; h < 128; h++) {
float sum = 0.0;
for (uint i = 0; i < 64; i++) {
float weight = dqn_weights[weight_offset + h * 64 + i];
float feature = shared_features[feature_offset + i];
sum += weight * feature;
}
// Add bias and apply batch normalization (identity parameters used as a placeholder)
float bias = dqn_weights[weight_offset + 64 * 128 + h];
float normalized = batch_norm(sum + bias, 0.0, 1.0, 1.0, 0.0);
shared_hidden1[tid * 128 + h] = mish(normalized);
}
weight_offset += 64 * 128 + 128; // Skip layer-1 weights and biases
// Layer 2: 128 -> 64 with Swish activation
for (uint h = 0; h < 64; h++) {
float sum = 0.0;
for (uint i = 0; i < 128; i++) {
float weight = dqn_weights[weight_offset + h * 128 + i];
sum += weight * shared_hidden1[tid * 128 + i];
}
float bias = dqn_weights[weight_offset + 128 * 64 + h];
shared_hidden2[tid * 64 + h] = swish(sum + bias);
}
weight_offset += 128 * 64 + 64; // Skip layer-2 weights and biases
// Output layer: 64 -> 32 Q-values with a dueling head
// Advantage stream A(s,a): 32 x 64 weights
float mean_advantage = 0.0;
for (uint a = 0; a < 32; a++) {
float advantage = 0.0;
for (uint i = 0; i < 64; i++) {
advantage += dqn_weights[weight_offset + a * 64 + i] * shared_hidden2[tid * 64 + i];
}
shared_q_values[tid * 32 + a] = advantage; // Hold advantages until V(s) is known
mean_advantage += advantage;
}
mean_advantage /= 32.0;
// State-value stream V(s): 64 weights stored after the advantage weights
float state_value = 0.0;
for (uint i = 0; i < 64; i++) {
state_value += dqn_weights[weight_offset + 32 * 64 + i] * shared_hidden2[tid * 64 + i];
}
// Dueling combination: Q(s,a) = V(s) + A(s,a) - mean_a A(s,a)
for (uint a = 0; a < 32; a++) {
shared_q_values[tid * 32 + a] = dueling_q_value(state_value, shared_q_values[tid * 32 + a], mean_advantage);
}
}
// Prioritized experience replay sampling
uint sample_priority_experience(uint system_idx) {
// Priority-proportional (roulette-wheel) sampling over priorities^alpha; costs O(buffer size) per call
float total_priority = 0.0;
for (uint i = 0; i < replay_buffer_size; i++) {
total_priority += pow(priorities[i], priority_alpha);
}
float random_val = fract(sin(float(system_idx) * 12.9898 + gl_GlobalInvocationID.x * 78.233) * 43758.5453);
float cumulative_priority = 0.0;
for (uint i = 0; i < replay_buffer_size; i++) {
cumulative_priority += pow(priorities[i], priority_alpha) / total_priority;
if (random_val <= cumulative_priority) {
return i;
}
}
return replay_buffer_size - 1; // Fallback
}
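// priority_beta is declared above but never applied in this shader. The helper below is a
// minimal sketch (an assumption, not part of the original design) of how the prioritized-replay
// importance-sampling correction w_i = (N * P(i))^(-beta) could be computed on the GPU, with
// P(i) = p_i^alpha / sum_k p_k^alpha; normalisation by the maximum weight is left to the host.
float importance_sampling_weight(uint experience_idx, float total_priority) {
float prob = pow(priorities[experience_idx], priority_alpha) / total_priority;
return pow(float(replay_buffer_size) * prob, -priority_beta);
}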
// Update experience replay buffer with new transition
void update_experience_replay(uint system_idx, float reward, uint action_taken) {
uint buffer_idx = system_idx % replay_buffer_size; // Each system overwrites its own slot; a rolling write index would be needed for a true ring buffer
uint experience_offset = buffer_idx * 132; // 64 + 1 + 1 + 64 + 1 + 1 (state + action + reward + next_state + done + priority_weight)
// Store current transition
for (uint i = 0; i < 64; i++) {
experience_memory[experience_offset + i] = state_features[system_idx * 64 + i];
}
experience_memory[experience_offset + 64] = float(action_taken);
experience_memory[experience_offset + 65] = reward;
// Next state will be updated in next call
// For now, mark as not done
experience_memory[experience_offset + 130] = 0.0; // not done
}
// Temporal difference error calculation for priority update
float calculate_td_error(uint experience_idx) {
uint experience_offset = experience_idx * 132;
// Extract state, action, reward, next_state
// Placeholder values: with current_q and next_max_q left at zero, the TD error below reduces to
// |reward|; a full implementation would evaluate the online and target networks on this transition.
float current_q = 0.0; // Q(s, a) from the online network
float target_q = 0.0;
float reward = experience_memory[experience_offset + 65];
float next_max_q = 0.0; // max_a Q_target(s', a) from the target network
target_q = reward + discount_factor * next_max_q;
return abs(current_q - target_q);
}
// Soft update of target network (Polyak averaging)
// Note: every workgroup performs the same redundant update; target_weights are assumed to be
// initialised as a copy of dqn_weights on the host before the first dispatch.
void soft_update_target_network() {
uint tid = gl_LocalInvocationID.x;
uint total_weights = 64 * 128 + 128 + 128 * 64 + 64 + 32 * 64 + 64; // Total DQN weights, matching the layout in forward_pass_dqn
for (uint i = tid; i < total_weights; i += 32) {
target_weights[i] = tau * dqn_weights[i] + (1.0 - tau) * target_weights[i];
}
}
// Meta-learning adaptation based on problem characteristics
float meta_learning_adaptation(uint system_idx) {
// Analyze problem characteristics and adapt learning parameters
float problem_complexity = state_features[system_idx * 64 + 60]; // Problem complexity feature
float error_volatility = state_features[system_idx * 64 + 61]; // Error volatility
float convergence_rate = state_features[system_idx * 64 + 62]; // Historical convergence
// Adaptive learning rate based on problem characteristics
float adaptive_lr = learning_rate;
if (problem_complexity > 0.8) {
adaptive_lr *= 0.5; // Reduce learning rate for complex problems
}
if (error_volatility > 0.7) {
adaptive_lr *= 0.7; // Reduce learning rate for volatile problems
}
return adaptive_lr;
}
// Advanced epsilon-greedy with decay schedule
float adaptive_epsilon(uint episode_count) {
// Exponential decay with minimum epsilon
float min_epsilon = 0.01;
float decay_rate = 0.995;
return max(min_epsilon, epsilon * pow(decay_rate, float(episode_count)));
}
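// Example: with epsilon = 1.0 this schedule gives roughly 0.61 after 100 episodes and
// clamps to the 0.01 floor shortly before episode 1000 (0.995^x < 0.01 for x > ~919).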
// Multi-objective reward function
float calculate_multi_objective_reward(uint system_idx) {
// Extract performance metrics
float accuracy_reward = state_features[system_idx * 64 + 48]; // Accuracy achievement
float efficiency_reward = state_features[system_idx * 64 + 49]; // Computational efficiency
float stability_reward = state_features[system_idx * 64 + 50]; // Numerical stability
float memory_reward = state_features[system_idx * 64 + 51]; // Memory efficiency
float convergence_reward = state_features[system_idx * 64 + 52]; // Convergence speed
// Fixed weighted combination (the weights themselves could be adapted per problem class)
float w1 = 0.3, w2 = 0.25, w3 = 0.2, w4 = 0.15, w5 = 0.1;
return w1 * accuracy_reward + w2 * efficiency_reward + w3 * stability_reward +
w4 * memory_reward + w5 * convergence_reward;
}
void main() {
uint system_idx = gl_GlobalInvocationID.x;
if (system_idx >= num_systems) return;
uint tid = gl_LocalInvocationID.x;
// Load state features into shared memory
for (uint i = 0; i < 64; i++) {
shared_features[tid * 64 + i] = state_features[system_idx * 64 + i];
}
// No barrier needed: each invocation touches only its own rows of the shared arrays
// Forward pass through DQN to get Q-values
forward_pass_dqn(system_idx);
// Action selection with advanced epsilon-greedy
uint episode_count = uint(training_data[0]);
float current_epsilon = adaptive_epsilon(episode_count);
// Generate random number for exploration
float random_val = fract(sin(float(system_idx) * 12.9898 + float(episode_count) * 78.233) * 43758.5453);
uint selected_action = 0;
if (random_val < current_epsilon) {
// Exploration: random action
selected_action = uint(random_val * 32.0) % 32;
} else {
// Exploitation: best Q-value
float max_q = shared_q_values[tid * 32 + 0];
for (uint a = 1; a < 32; a++) {
if (shared_q_values[tid * 32 + a] > max_q) {
max_q = shared_q_values[tid * 32 + a];
selected_action = a;
}
}
}
// Convert action to step size multiplier
float step_multipliers[32] = float[32](
0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0,
2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0,
3.1, 3.2
);
float current_step = state_features[system_idx * 64 + 15]; // Current step size from state
float optimal_step = current_step * step_multipliers[selected_action];
// Apply safety constraints
float min_step = current_step * 0.01; // Don't reduce by more than 100x
float max_step = current_step * 10.0; // Don't increase by more than 10x
optimal_step = clamp(optimal_step, min_step, max_step);
// Store optimal step size
optimal_steps[system_idx] = optimal_step;
// Training mode: update network weights and experience replay
if (training_mode) {
// Calculate reward based on performance
float reward = calculate_multi_objective_reward(system_idx);
// Update experience replay buffer
update_experience_replay(system_idx, reward, selected_action);
// Periodically refresh replay priorities and the target network (no gradient update is performed here; mini-batch weight updates are assumed to run in a separate training pass)
if ((episode_count % 4) == 0) {
// Sample prioritized experience
uint sampled_idx = sample_priority_experience(system_idx);
// Calculate TD error for priority update
float td_error = calculate_td_error(sampled_idx);
priorities[sampled_idx] = td_error + 1e-6; // Small constant to avoid zero priority
// Soft update target network
if ((episode_count % uint(target_update_freq)) == 0) {
soft_update_target_network();
}
}
// Update training metadata from a single invocation to avoid write races between workgroups
// (total_reward therefore accumulates the reward of system 0 only in this pass)
if (gl_GlobalInvocationID.x == 0) {
training_data[0] = float(episode_count + 1); // Increment episode count
training_data[1] += reward; // Accumulate total reward
training_data[2] = current_epsilon; // Store current epsilon
training_data[3] = meta_learning_adaptation(system_idx); // Adaptive learning rate
}
}
}
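// Host-side note (assumption, not part of this shader): one invocation is expected per ODE
// system, so a dispatch of ceil(num_systems / 32) workgroups in X covers all systems, e.g.
// glDispatchCompute((num_systems + 31) / 32, 1, 1) in an OpenGL 4.3 host application.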