quantrs2-core 0.1.3

//! Training utilities for quantum machine learning
//!
//! This module provides training loops, loss functions, and optimization
//! strategies for quantum machine learning models.

use super::{natural_gradient, quantum_fisher_information, QMLCircuit};
use crate::{
    error::{QuantRS2Error, QuantRS2Result},
    gpu::GpuBackendFactory,
};
use scirs2_core::ndarray::Array1;
use std::collections::HashMap;
// Note: scirs2_optimize functions would be used here if available

/// Loss functions for QML
#[derive(Debug, Clone, Copy)]
pub enum LossFunction {
    /// Mean squared error
    MSE,
    /// Cross entropy loss
    CrossEntropy,
    /// Fidelity loss
    Fidelity,
    /// Variational loss for VQE
    Variational,
    /// Custom loss function
    Custom,
}

/// Optimizer for QML models
#[derive(Debug, Clone)]
pub enum Optimizer {
    /// Gradient descent
    GradientDescent { learning_rate: f64 },
    /// Adam optimizer
    Adam {
        learning_rate: f64,
        beta1: f64,
        beta2: f64,
        epsilon: f64,
    },
    /// Natural gradient descent
    NaturalGradient {
        learning_rate: f64,
        regularization: f64,
    },
    /// BFGS optimizer
    BFGS,
    /// Quantum natural gradient
    QuantumNatural {
        learning_rate: f64,
        regularization: f64,
    },
}

/// Training configuration
#[derive(Debug, Clone)]
pub struct TrainingConfig {
    /// Maximum number of epochs
    pub max_epochs: usize,
    /// Batch size
    pub batch_size: usize,
    /// Convergence tolerance
    pub tolerance: f64,
    /// Whether to use GPU acceleration
    pub use_gpu: bool,
    /// Validation split ratio
    pub validation_split: f64,
    /// Early stopping patience
    pub early_stopping_patience: Option<usize>,
    /// Gradient clipping value
    pub gradient_clip: Option<f64>,
}

impl Default for TrainingConfig {
    fn default() -> Self {
        Self {
            max_epochs: 100,
            batch_size: 32,
            tolerance: 1e-6,
            use_gpu: true,
            validation_split: 0.2,
            early_stopping_patience: Some(10),
            gradient_clip: Some(1.0),
        }
    }
}

/// Training metrics
#[derive(Debug, Clone, Default)]
pub struct TrainingMetrics {
    /// Loss history
    pub loss_history: Vec<f64>,
    /// Validation loss history
    pub val_loss_history: Vec<f64>,
    /// Gradient norms
    pub gradient_norms: Vec<f64>,
    /// Parameter history
    pub parameter_history: Vec<Vec<f64>>,
    /// Best validation loss
    pub best_val_loss: f64,
    /// Best parameters
    pub best_parameters: Vec<f64>,
}

/// QML trainer
pub struct QMLTrainer {
    /// The quantum circuit
    circuit: QMLCircuit,
    /// Loss function
    loss_fn: LossFunction,
    /// Optimizer
    optimizer: Optimizer,
    /// Training configuration
    config: TrainingConfig,
    /// Training metrics
    metrics: TrainingMetrics,
    /// Adam optimizer state
    adam_state: Option<AdamState>,
}

/// Adam optimizer state
#[derive(Debug, Clone)]
struct AdamState {
    m: Vec<f64>, // First moment
    v: Vec<f64>, // Second moment
    t: usize,    // Time step
}

impl QMLTrainer {
    /// Create a new trainer
    pub fn new(
        circuit: QMLCircuit,
        loss_fn: LossFunction,
        optimizer: Optimizer,
        config: TrainingConfig,
    ) -> Self {
        let num_params = circuit.num_parameters;
        let adam_state = match &optimizer {
            Optimizer::Adam { .. } => Some(AdamState {
                m: vec![0.0; num_params],
                v: vec![0.0; num_params],
                t: 0,
            }),
            _ => None,
        };

        Self {
            circuit,
            loss_fn,
            optimizer,
            config,
            metrics: TrainingMetrics::default(),
            adam_state,
        }
    }

    /// Train the model
    pub fn train(
        &mut self,
        train_data: &[(Vec<f64>, Vec<f64>)],
        val_data: Option<&[(Vec<f64>, Vec<f64>)]>,
    ) -> QuantRS2Result<TrainingMetrics> {
        // Initialize GPU if requested
        let gpu_backend = if self.config.use_gpu {
            Some(GpuBackendFactory::create_best_available()?)
        } else {
            None
        };

        let mut best_val_loss = f64::INFINITY;
        let mut patience_counter = 0;

        for epoch in 0..self.config.max_epochs {
            // Training step
            let train_loss = self.train_epoch(train_data, &gpu_backend)?;
            self.metrics.loss_history.push(train_loss);

            // Validation step
            if let Some(val_data) = val_data {
                let val_loss = self.evaluate(val_data, &gpu_backend)?;
                self.metrics.val_loss_history.push(val_loss);

                // Early stopping
                if val_loss < best_val_loss {
                    best_val_loss = val_loss;
                    self.metrics.best_val_loss = val_loss;
                    self.metrics.best_parameters = self.get_parameters();
                    patience_counter = 0;
                } else if let Some(patience) = self.config.early_stopping_patience {
                    patience_counter += 1;
                    if patience_counter >= patience {
                        println!("Early stopping at epoch {epoch}");
                        break;
                    }
                }
            }

            // Check convergence
            if epoch > 0 {
                let loss_change =
                    (self.metrics.loss_history[epoch] - self.metrics.loss_history[epoch - 1]).abs();
                if loss_change < self.config.tolerance {
                    println!("Converged at epoch {epoch}");
                    break;
                }
            }

            // Log progress
            if epoch % 10 == 0 {
                println!("Epoch {epoch}: train_loss = {train_loss:.6}");
                if let Some(val_loss) = self.metrics.val_loss_history.last() {
                    println!("         val_loss = {val_loss:.6}");
                }
            }
        }

        Ok(self.metrics.clone())
    }

    /// Train for one epoch
    fn train_epoch(
        &mut self,
        data: &[(Vec<f64>, Vec<f64>)],
        gpu_backend: &Option<std::sync::Arc<dyn crate::gpu::GpuBackend>>,
    ) -> QuantRS2Result<f64> {
        let mut epoch_loss = 0.0;
        let num_batches = (data.len() + self.config.batch_size - 1) / self.config.batch_size;

        for batch_idx in 0..num_batches {
            let start = batch_idx * self.config.batch_size;
            let end = (start + self.config.batch_size).min(data.len());
            let batch = &data[start..end];

            // Compute gradients for batch
            let (loss, gradients) = self.compute_batch_gradients(batch, gpu_backend)?;
            epoch_loss += loss;

            // Apply gradient clipping if configured
            let clipped_gradients = if let Some(clip_value) = self.config.gradient_clip {
                self.clip_gradients(&gradients, clip_value)
            } else {
                gradients
            };

            // Update parameters
            self.update_parameters(&clipped_gradients)?;

            // Record gradient norm
            let grad_norm = clipped_gradients.iter().map(|g| g * g).sum::<f64>().sqrt();
            self.metrics.gradient_norms.push(grad_norm);
        }

        Ok(epoch_loss / num_batches as f64)
    }

    /// Compute gradients for a batch
    fn compute_batch_gradients(
        &self,
        batch: &[(Vec<f64>, Vec<f64>)],
        gpu_backend: &Option<std::sync::Arc<dyn crate::gpu::GpuBackend>>,
    ) -> QuantRS2Result<(f64, Vec<f64>)> {
        let mut total_loss = 0.0;
        let mut total_gradients = vec![0.0; self.circuit.num_parameters];

        for (input, target) in batch {
            // Forward pass
            let output = self.forward(input, gpu_backend)?;

            // Compute loss
            let loss = self.compute_loss(&output, target)?;
            total_loss += loss;

            // Compute gradients (placeholder - would use parameter shift rule)
            let gradients = vec![0.0; self.circuit.num_parameters]; // Placeholder

            // Accumulate gradients
            for (i, &grad) in gradients.iter().enumerate() {
                total_gradients[i] += grad;
            }
        }

        // Average over batch
        let batch_size = batch.len() as f64;
        total_loss /= batch_size;
        for grad in &mut total_gradients {
            *grad /= batch_size;
        }

        Ok((total_loss, total_gradients))
    }

    /// Forward pass through the circuit
    fn forward(
        &self,
        input: &[f64],
        _gpu_backend: &Option<std::sync::Arc<dyn crate::gpu::GpuBackend>>,
    ) -> QuantRS2Result<Vec<f64>> {
        // This is a placeholder implementation
        // In practice, would:
        // 1. Encode input data
        // 2. Apply circuit gates
        // 3. Measure or compute expectation values
        // 4. Return output

        Ok(vec![0.5; input.len()])
    }

    /// Compute loss
    fn compute_loss(&self, output: &[f64], target: &[f64]) -> QuantRS2Result<f64> {
        if output.len() != target.len() {
            return Err(QuantRS2Error::InvalidInput(
                "Output and target dimensions mismatch".to_string(),
            ));
        }

        match self.loss_fn {
            LossFunction::MSE => {
                let mse = output
                    .iter()
                    .zip(target.iter())
                    .map(|(o, t)| (o - t).powi(2))
                    .sum::<f64>()
                    / output.len() as f64;
                Ok(mse)
            }
            LossFunction::CrossEntropy => {
                let epsilon = 1e-10;
                let ce = -output
                    .iter()
                    .zip(target.iter())
                    .map(|(o, t)| t * (o + epsilon).ln())
                    .sum::<f64>()
                    / output.len() as f64;
                Ok(ce)
            }
            _ => Ok(0.0), // Placeholder for other loss functions
        }
    }

    /// Update parameters using the optimizer
    fn update_parameters(&mut self, gradients: &[f64]) -> QuantRS2Result<()> {
        let current_params = self.get_parameters();
        let new_params = match &mut self.optimizer {
            Optimizer::GradientDescent { learning_rate } => current_params
                .iter()
                .zip(gradients.iter())
                .map(|(p, g)| p - *learning_rate * g)
                .collect(),

            Optimizer::Adam {
                learning_rate,
                beta1,
                beta2,
                epsilon,
            } => {
                if let Some(state) = &mut self.adam_state {
                    state.t += 1;
                    let t = state.t as f64;

                    let mut new_params = vec![0.0; current_params.len()];
                    for i in 0..current_params.len() {
                        // Update biased first moment estimate
                        state.m[i] = (*beta1).mul_add(state.m[i], (1.0 - *beta1) * gradients[i]);

                        // Update biased second raw moment estimate
                        state.v[i] =
                            (*beta2).mul_add(state.v[i], (1.0 - *beta2) * gradients[i].powi(2));

                        // Compute bias-corrected first moment estimate
                        let m_hat = state.m[i] / (1.0 - beta1.powf(t));

                        // Compute bias-corrected second raw moment estimate
                        let v_hat = state.v[i] / (1.0 - beta2.powf(t));

                        // Update parameters
                        new_params[i] =
                            current_params[i] - *learning_rate * m_hat / (v_hat.sqrt() + *epsilon);
                    }
                    new_params
                } else {
                    current_params
                }
            }

            Optimizer::QuantumNatural {
                learning_rate: _,
                regularization,
            } => {
                // Compute quantum Fisher information
                let state = Array1::zeros(1 << self.circuit.config.num_qubits);
                let fisher = quantum_fisher_information(&self.circuit, &state)?;

                // Compute natural gradient
                natural_gradient(gradients, &fisher, *regularization)?
            }

            _ => current_params, // Placeholder for other optimizers
        };

        self.circuit.set_parameters(&new_params)?;
        self.metrics.parameter_history.push(new_params);

        Ok(())
    }

    /// Clip gradients
    fn clip_gradients(&self, gradients: &[f64], clip_value: f64) -> Vec<f64> {
        let norm = gradients.iter().map(|g| g * g).sum::<f64>().sqrt();

        if norm > clip_value {
            gradients.iter().map(|g| g * clip_value / norm).collect()
        } else {
            gradients.to_vec()
        }
    }

    /// Evaluate on a dataset
    fn evaluate(
        &self,
        data: &[(Vec<f64>, Vec<f64>)],
        gpu_backend: &Option<std::sync::Arc<dyn crate::gpu::GpuBackend>>,
    ) -> QuantRS2Result<f64> {
        let mut total_loss = 0.0;

        for (input, target) in data {
            let output = self.forward(input, gpu_backend)?;
            let loss = self.compute_loss(&output, target)?;
            total_loss += loss;
        }

        Ok(total_loss / data.len() as f64)
    }

    /// Get current parameters
    fn get_parameters(&self) -> Vec<f64> {
        self.circuit.parameters().iter().map(|p| p.value).collect()
    }
}

/// Hyperparameter optimization for QML
pub struct HyperparameterOptimizer {
    /// Search space
    search_space: HashMap<String, (f64, f64)>,
    /// Number of trials
    num_trials: usize,
    /// Optimization strategy
    strategy: HPOStrategy,
}

#[derive(Debug, Clone, Copy)]
pub enum HPOStrategy {
    /// Random search
    Random,
    /// Grid search
    Grid,
    /// Bayesian optimization
    Bayesian,
}

impl HyperparameterOptimizer {
    /// Create a new hyperparameter optimizer
    pub const fn new(
        search_space: HashMap<String, (f64, f64)>,
        num_trials: usize,
        strategy: HPOStrategy,
    ) -> Self {
        Self {
            search_space,
            num_trials,
            strategy,
        }
    }

    /// Run hyperparameter optimization
    pub fn optimize<F>(&self, _objective: F) -> QuantRS2Result<HashMap<String, f64>>
    where
        F: Fn(&HashMap<String, f64>) -> QuantRS2Result<f64>,
    {
        // Placeholder implementation
        // Would implement actual HPO strategies here
        Ok(HashMap::new())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::qml::QMLConfig;

    #[test]
    fn test_trainer_creation() {
        let config = QMLConfig::default();
        let circuit = QMLCircuit::new(config);

        let trainer = QMLTrainer::new(
            circuit,
            LossFunction::MSE,
            Optimizer::Adam {
                learning_rate: 0.01,
                beta1: 0.9,
                beta2: 0.999,
                epsilon: 1e-8,
            },
            TrainingConfig::default(),
        );

        assert_eq!(trainer.metrics.loss_history.len(), 0);
    }

    #[test]
    fn test_gradient_clipping() {
        let config = QMLConfig::default();
        let circuit = QMLCircuit::new(config);
        let trainer = QMLTrainer::new(
            circuit,
            LossFunction::MSE,
            Optimizer::GradientDescent { learning_rate: 0.1 },
            TrainingConfig::default(),
        );

        let gradients = vec![3.0, 4.0]; // Norm = 5
        let clipped = trainer.clip_gradients(&gradients, 1.0);

        let norm = clipped.iter().map(|g| g * g).sum::<f64>().sqrt();
        assert!((norm - 1.0).abs() < 1e-10);
    }

    #[test]
    fn test_loss_computation() {
        let config = QMLConfig::default();
        let circuit = QMLCircuit::new(config);
        let trainer = QMLTrainer::new(
            circuit,
            LossFunction::MSE,
            Optimizer::GradientDescent { learning_rate: 0.1 },
            TrainingConfig::default(),
        );

        let output = vec![0.0, 0.5, 1.0];
        let target = vec![0.0, 0.0, 1.0];

        let loss = trainer
            .compute_loss(&output, &target)
            .expect("Loss computation should succeed");
        assert!((loss - 0.25 / 3.0).abs() < 1e-10); // MSE = (0 + 0.25 + 0) / 3
    }
}