sklears_kernel_approximation/gradient_kernel_learning.rs

//! Gradient-based kernel learning for automatic parameter optimization
//!
//! This module provides gradient-based optimization methods for learning optimal
//! kernel parameters, including bandwidth selection, kernel combination weights,
//! and hyperparameter tuning, using analytically computed kernel gradients.
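//!
//! A minimal usage sketch (marked `ignore` because the exact crate path and
//! re-exports are assumptions rather than verified API):
//!
//! ```ignore
//! use scirs2_core::ndarray::{Array1, Array2};
//! use sklears_kernel_approximation::gradient_kernel_learning::{
//!     GradientKernelLearner, GradientOptimizer, KernelObjective,
//! };
//!
//! let x = Array2::from_shape_vec((4, 2), vec![1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0]).unwrap();
//! let mut learner = GradientKernelLearner::new(1)
//!     .with_optimizer(GradientOptimizer::Adam)
//!     .with_objective(KernelObjective::KernelAlignment);
//! learner.initialize_parameters(Array1::from_vec(vec![1.0]));
//! let learned = learner.optimize(&x, None).unwrap();
//! ```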

use scirs2_core::ndarray::{s, Array1, Array2, ArrayView2};
use sklears_core::error::Result;

/// Gradient-based optimization configuration
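///
/// Individual fields can be overridden with struct update syntax; a minimal
/// sketch (marked `ignore` since it is illustrative only):
///
/// ```ignore
/// let config = GradientConfig {
///     learning_rate: 0.05,
///     max_iterations: 500,
///     ..GradientConfig::default()
/// };
/// ```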
#[derive(Clone, Debug)]
pub struct GradientConfig {
    /// Learning rate for gradient descent
    pub learning_rate: f64,
    /// Maximum number of iterations
    pub max_iterations: usize,
    /// Convergence tolerance
    pub tolerance: f64,
    /// Momentum parameter
    pub momentum: f64,
    /// L2 regularization strength
    pub l2_regularization: f64,
    /// Whether to use adaptive learning rate
    pub adaptive_learning_rate: bool,
    /// Learning rate decay factor
    pub learning_rate_decay: f64,
    /// Minimum learning rate
    pub min_learning_rate: f64,
    /// Batch size for stochastic gradient descent
    pub batch_size: usize,
}

impl Default for GradientConfig {
    fn default() -> Self {
        Self {
            learning_rate: 0.01,
            max_iterations: 1000,
            tolerance: 1e-6,
            momentum: 0.9,
            l2_regularization: 1e-4,
            adaptive_learning_rate: true,
            learning_rate_decay: 0.99,
            min_learning_rate: 1e-6,
            batch_size: 256,
        }
    }
}

/// Gradient-based optimization algorithms
#[derive(Clone, Debug, PartialEq)]
pub enum GradientOptimizer {
    /// Standard gradient descent
    SGD,
    /// Momentum-based gradient descent
    Momentum,
    /// Adam optimizer
    Adam,
    /// AdaGrad optimizer
    AdaGrad,
    /// RMSprop optimizer
    RMSprop,
    /// L-BFGS optimizer
    LBFGS,
}

/// Objective function for kernel learning
#[derive(Clone, Debug, PartialEq)]
pub enum KernelObjective {
    /// Kernel alignment
    KernelAlignment,
    /// Cross-validation error
    CrossValidationError,
    /// Marginal likelihood (for Gaussian processes)
    MarginalLikelihood,
    /// Kernel ridge regression loss
    KernelRidgeLoss,
    /// Maximum mean discrepancy
    MaximumMeanDiscrepancy,
    /// Kernel target alignment
    KernelTargetAlignment,
}

/// Gradient computation result
#[derive(Clone, Debug)]
pub struct GradientResult {
    /// Gradient vector
    pub gradient: Array1<f64>,
    /// Objective function value
    pub objective_value: f64,
    /// Hessian matrix (if computed)
    pub hessian: Option<Array2<f64>>,
}

/// Gradient-based kernel parameter learner
pub struct GradientKernelLearner {
    config: GradientConfig,
    optimizer: GradientOptimizer,
    objective: KernelObjective,
    parameters: Array1<f64>,
    parameter_bounds: Option<Array2<f64>>,
    optimization_history: Vec<(f64, Array1<f64>)>,
    velocity: Option<Array1<f64>>,
    adam_m: Option<Array1<f64>>,
    adam_v: Option<Array1<f64>>,
    iteration: usize,
}

impl GradientKernelLearner {
    /// Create a new gradient-based kernel learner
    pub fn new(n_parameters: usize) -> Self {
        Self {
            config: GradientConfig::default(),
            optimizer: GradientOptimizer::Adam,
            objective: KernelObjective::KernelAlignment,
            parameters: Array1::ones(n_parameters),
            parameter_bounds: None,
            optimization_history: Vec::new(),
            velocity: None,
            adam_m: None,
            adam_v: None,
            iteration: 0,
        }
    }

    /// Set configuration
    pub fn with_config(mut self, config: GradientConfig) -> Self {
        self.config = config;
        self
    }

    /// Set optimizer
    pub fn with_optimizer(mut self, optimizer: GradientOptimizer) -> Self {
        self.optimizer = optimizer;
        self
    }

    /// Set objective function
    pub fn with_objective(mut self, objective: KernelObjective) -> Self {
        self.objective = objective;
        self
    }

    /// Set parameter bounds
    pub fn with_bounds(mut self, bounds: Array2<f64>) -> Self {
        self.parameter_bounds = Some(bounds);
        self
    }

    /// Initialize parameters
    pub fn initialize_parameters(&mut self, initial_params: Array1<f64>) {
        self.parameters = initial_params;
        self.velocity = Some(Array1::zeros(self.parameters.len()));
        self.adam_m = Some(Array1::zeros(self.parameters.len()));
        self.adam_v = Some(Array1::zeros(self.parameters.len()));
        self.iteration = 0;
        self.optimization_history.clear();
        // Apply bounds to ensure initial parameters are within constraints
        self.apply_bounds();
    }

    /// Optimize kernel parameters
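    ///
    /// Runs up to `max_iterations` optimizer steps, stopping early once the L1
    /// norm of the gradient drops below `tolerance`. Objectives that need
    /// targets (cross-validation error, marginal likelihood, kernel ridge loss,
    /// kernel target alignment) require `y = Some(..)`; kernel alignment and
    /// MMD accept `y = None`. Returns the final parameter vector.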
    pub fn optimize(&mut self, x: &Array2<f64>, y: Option<&Array1<f64>>) -> Result<Array1<f64>> {
        for iteration in 0..self.config.max_iterations {
            self.iteration = iteration;

            // Compute gradient
            let gradient_result = self.compute_gradient(x, y)?;

            // Check convergence
            if gradient_result
                .gradient
                .iter()
                .map(|&g| g.abs())
                .sum::<f64>()
                < self.config.tolerance
            {
                break;
            }

            // Update parameters
            self.update_parameters(&gradient_result.gradient)?;

            // Store optimization history
            self.optimization_history
                .push((gradient_result.objective_value, self.parameters.clone()));

            // Adaptive learning rate
            if self.config.adaptive_learning_rate && iteration > 0 {
                self.update_learning_rate(iteration);
            }
        }

        Ok(self.parameters.clone())
    }

    /// Compute gradient of the objective function
    fn compute_gradient(&self, x: &Array2<f64>, y: Option<&Array1<f64>>) -> Result<GradientResult> {
        match self.objective {
            KernelObjective::KernelAlignment => self.compute_kernel_alignment_gradient(x),
            KernelObjective::CrossValidationError => self.compute_cv_error_gradient(x, y),
            KernelObjective::MarginalLikelihood => self.compute_marginal_likelihood_gradient(x, y),
            KernelObjective::KernelRidgeLoss => self.compute_kernel_ridge_gradient(x, y),
            KernelObjective::MaximumMeanDiscrepancy => self.compute_mmd_gradient(x),
            KernelObjective::KernelTargetAlignment => self.compute_kta_gradient(x, y),
        }
    }

    /// Compute kernel alignment gradient
    fn compute_kernel_alignment_gradient(&self, x: &Array2<f64>) -> Result<GradientResult> {
        let mut gradient = Array1::zeros(self.parameters.len());

        // Compute kernel matrix
        let kernel_matrix = self.compute_kernel_matrix(x)?;

        // Compute kernel matrix derivatives
        let kernel_derivatives = self.compute_kernel_derivatives(x)?;

        // Compute alignment and its gradient
        let alignment = self.compute_kernel_alignment(&kernel_matrix);

        for i in 0..self.parameters.len() {
            let kernel_derivative = &kernel_derivatives[i];
            let alignment_derivative =
                self.compute_alignment_derivative(&kernel_matrix, kernel_derivative);
            gradient[i] = alignment_derivative;
        }

        Ok(GradientResult {
            gradient,
            objective_value: alignment,
            hessian: None,
        })
    }

    /// Compute cross-validation error gradient
    fn compute_cv_error_gradient(
        &self,
        x: &Array2<f64>,
        y: Option<&Array1<f64>>,
    ) -> Result<GradientResult> {
        let y = y.ok_or_else(|| "Target values required for CV error gradient")?;
        let n_samples = x.nrows();
        let n_folds = 5;
        let fold_size = n_samples / n_folds;

        let mut gradient = Array1::zeros(self.parameters.len());
        let mut total_error = 0.0;

        for fold in 0..n_folds {
            let start_idx = fold * fold_size;
            let end_idx = std::cmp::min(start_idx + fold_size, n_samples);

            // Split data
            let (x_train, y_train, x_val, y_val) = self.split_data(x, y, start_idx, end_idx);

            // Compute fold gradient
            let fold_gradient = self.compute_fold_gradient(&x_train, &y_train, &x_val, &y_val)?;

            gradient = gradient + fold_gradient.gradient;
            total_error += fold_gradient.objective_value;
        }

        gradient = gradient / n_folds as f64;
        total_error /= n_folds as f64;

        Ok(GradientResult {
            gradient,
            objective_value: total_error,
            hessian: None,
        })
    }

    /// Compute marginal likelihood gradient
    fn compute_marginal_likelihood_gradient(
        &self,
        x: &Array2<f64>,
        y: Option<&Array1<f64>>,
    ) -> Result<GradientResult> {
        let y = y.ok_or_else(|| "Target values required for marginal likelihood gradient")?;
        let n_samples = x.nrows();

        // Compute kernel matrix
        let kernel_matrix = self.compute_kernel_matrix(x)?;

        // Add noise term
        let noise_variance = 1e-6;
        let mut k_with_noise = kernel_matrix.clone();
        for i in 0..n_samples {
            k_with_noise[[i, i]] += noise_variance;
        }

        // Compute log marginal likelihood
        let log_marginal_likelihood = self.compute_log_marginal_likelihood(&k_with_noise, y)?;

        // Compute gradient
        let mut gradient = Array1::zeros(self.parameters.len());
        let kernel_derivatives = self.compute_kernel_derivatives(x)?;

        for i in 0..self.parameters.len() {
            let kernel_derivative = &kernel_derivatives[i];
            let ml_derivative =
                self.compute_marginal_likelihood_derivative(&k_with_noise, y, kernel_derivative)?;
            gradient[i] = ml_derivative;
        }

        Ok(GradientResult {
            gradient,
            objective_value: -log_marginal_likelihood, // Negative for minimization
            hessian: None,
        })
    }

    /// Compute kernel ridge regression gradient
    fn compute_kernel_ridge_gradient(
        &self,
        x: &Array2<f64>,
        y: Option<&Array1<f64>>,
    ) -> Result<GradientResult> {
        let y = y.ok_or_else(|| "Target values required for kernel ridge gradient")?;
        let n_samples = x.nrows();
        let alpha = 1e-3; // Regularization parameter

        // Compute kernel matrix
        let kernel_matrix = self.compute_kernel_matrix(x)?;

        // Add regularization
        let mut k_reg = kernel_matrix.clone();
        for i in 0..n_samples {
            k_reg[[i, i]] += alpha;
        }

        // Compute kernel ridge loss
        let kr_loss = self.compute_kernel_ridge_loss(&k_reg, y)?;

        // Compute gradient
        let mut gradient = Array1::zeros(self.parameters.len());
        let kernel_derivatives = self.compute_kernel_derivatives(x)?;

        for i in 0..self.parameters.len() {
            let kernel_derivative = &kernel_derivatives[i];
            let kr_derivative =
                self.compute_kernel_ridge_derivative(&k_reg, y, kernel_derivative)?;
            gradient[i] = kr_derivative;
        }

        Ok(GradientResult {
            gradient,
            objective_value: kr_loss,
            hessian: None,
        })
    }

    /// Compute maximum mean discrepancy gradient
    fn compute_mmd_gradient(&self, x: &Array2<f64>) -> Result<GradientResult> {
        let n_samples = x.nrows();
        let split_point = n_samples / 2;

        let x1 = x.slice(s![..split_point, ..]);
        let x2 = x.slice(s![split_point.., ..]);

        // Compute MMD
        let mmd = self.compute_mmd(&x1, &x2)?;

        // Compute gradient
        let mut gradient = Array1::zeros(self.parameters.len());
        let mmd_derivatives = self.compute_mmd_derivatives(&x1, &x2)?;

        for i in 0..self.parameters.len() {
            gradient[i] = mmd_derivatives[i];
        }

        Ok(GradientResult {
            gradient,
            objective_value: mmd,
            hessian: None,
        })
    }

    /// Compute kernel target alignment gradient
    fn compute_kta_gradient(
        &self,
        x: &Array2<f64>,
        y: Option<&Array1<f64>>,
    ) -> Result<GradientResult> {
        let y = y.ok_or_else(|| "Target values required for KTA gradient")?;

        // Compute kernel matrix
        let kernel_matrix = self.compute_kernel_matrix(x)?;

        // Compute target kernel matrix
        let target_kernel = self.compute_target_kernel(y);

        // Compute KTA
        let kta = self.compute_kta(&kernel_matrix, &target_kernel);

        // Compute gradient
        let mut gradient = Array1::zeros(self.parameters.len());
        let kernel_derivatives = self.compute_kernel_derivatives(x)?;

        for i in 0..self.parameters.len() {
            let kernel_derivative = &kernel_derivatives[i];
            let kta_derivative =
                self.compute_kta_derivative(&kernel_matrix, &target_kernel, kernel_derivative);
            gradient[i] = kta_derivative;
        }

        Ok(GradientResult {
            gradient,
            objective_value: -kta, // Negative for minimization
            hessian: None,
        })
    }

    /// Update parameters using the chosen optimizer
    fn update_parameters(&mut self, gradient: &Array1<f64>) -> Result<()> {
        match self.optimizer {
            GradientOptimizer::SGD => self.update_sgd(gradient),
            GradientOptimizer::Momentum => self.update_momentum(gradient),
            GradientOptimizer::Adam => self.update_adam(gradient),
            GradientOptimizer::AdaGrad => self.update_adagrad(gradient),
            GradientOptimizer::RMSprop => self.update_rmsprop(gradient),
            GradientOptimizer::LBFGS => self.update_lbfgs(gradient),
        }
    }

    /// SGD update
    fn update_sgd(&mut self, gradient: &Array1<f64>) -> Result<()> {
        for i in 0..self.parameters.len() {
            self.parameters[i] -= self.config.learning_rate * gradient[i];
        }
        self.apply_bounds();
        Ok(())
    }

    /// Momentum update
    fn update_momentum(&mut self, gradient: &Array1<f64>) -> Result<()> {
        // Initialize velocity lazily so this optimizer also works when
        // `initialize_parameters` has not been called explicitly
        if self.velocity.is_none() {
            self.velocity = Some(Array1::zeros(self.parameters.len()));
        }
        let velocity = self.velocity.as_mut().unwrap();

        for i in 0..self.parameters.len() {
            velocity[i] =
                self.config.momentum * velocity[i] - self.config.learning_rate * gradient[i];
            self.parameters[i] += velocity[i];
        }

        self.apply_bounds();
        Ok(())
    }

    /// Adam update
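    ///
    /// Standard Adam recurrences with `beta1 = 0.9`, `beta2 = 0.999`, and
    /// `epsilon = 1e-8` hard-coded below:
    ///
    /// ```text
    /// m_t = beta1 * m_{t-1} + (1 - beta1) * g_t
    /// v_t = beta2 * v_{t-1} + (1 - beta2) * g_t^2
    /// m_hat = m_t / (1 - beta1^t),    v_hat = v_t / (1 - beta2^t)
    /// theta_t = theta_{t-1} - lr * m_hat / (sqrt(v_hat) + epsilon)
    /// ```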
    fn update_adam(&mut self, gradient: &Array1<f64>) -> Result<()> {
        // Initialize Adam state if not already done
        if self.adam_m.is_none() {
            self.adam_m = Some(Array1::zeros(self.parameters.len()));
            self.adam_v = Some(Array1::zeros(self.parameters.len()));
        }

        let adam_m = self.adam_m.as_mut().unwrap();
        let adam_v = self.adam_v.as_mut().unwrap();

        let beta1 = 0.9;
        let beta2 = 0.999;
        let epsilon = 1e-8;

        for i in 0..self.parameters.len() {
            // Update biased first moment estimate
            adam_m[i] = beta1 * adam_m[i] + (1.0 - beta1) * gradient[i];

            // Update biased second raw moment estimate
            adam_v[i] = beta2 * adam_v[i] + (1.0 - beta2) * gradient[i] * gradient[i];

            // Compute bias-corrected first moment estimate
            let m_hat = adam_m[i] / (1.0 - beta1.powi(self.iteration as i32 + 1));

            // Compute bias-corrected second raw moment estimate
            let v_hat = adam_v[i] / (1.0 - beta2.powi(self.iteration as i32 + 1));

            // Update parameters
            self.parameters[i] -= self.config.learning_rate * m_hat / (v_hat.sqrt() + epsilon);
        }

        self.apply_bounds();
        Ok(())
    }

    /// AdaGrad update
    fn update_adagrad(&mut self, gradient: &Array1<f64>) -> Result<()> {
        if self.adam_v.is_none() {
            self.adam_v = Some(Array1::zeros(self.parameters.len()));
        }

        // Reuse the `adam_v` buffer as the accumulated squared gradient
        let accumulated_grad = self.adam_v.as_mut().unwrap();
        let epsilon = 1e-8;

        for i in 0..self.parameters.len() {
            accumulated_grad[i] += gradient[i] * gradient[i];
            self.parameters[i] -=
                self.config.learning_rate * gradient[i] / (accumulated_grad[i].sqrt() + epsilon);
        }

        self.apply_bounds();
        Ok(())
    }

    /// RMSprop update
    fn update_rmsprop(&mut self, gradient: &Array1<f64>) -> Result<()> {
        if self.adam_v.is_none() {
            self.adam_v = Some(Array1::zeros(self.parameters.len()));
        }

        let accumulated_grad = self.adam_v.as_mut().unwrap();
        let decay_rate = 0.9;
        let epsilon = 1e-8;

        for i in 0..self.parameters.len() {
            accumulated_grad[i] =
                decay_rate * accumulated_grad[i] + (1.0 - decay_rate) * gradient[i] * gradient[i];
            self.parameters[i] -=
                self.config.learning_rate * gradient[i] / (accumulated_grad[i].sqrt() + epsilon);
        }

        self.apply_bounds();
        Ok(())
    }

    /// L-BFGS update (simplified version)
    fn update_lbfgs(&mut self, gradient: &Array1<f64>) -> Result<()> {
        // Simplified L-BFGS - just use gradient descent for now
        for i in 0..self.parameters.len() {
            self.parameters[i] -= self.config.learning_rate * gradient[i];
        }
        self.apply_bounds();
        Ok(())
    }

    /// Apply parameter bounds
    fn apply_bounds(&mut self) {
        if let Some(bounds) = &self.parameter_bounds {
            for i in 0..self.parameters.len() {
                self.parameters[i] = self.parameters[i].max(bounds[[i, 0]]).min(bounds[[i, 1]]);
            }
        }
    }

    /// Update learning rate adaptively
    fn update_learning_rate(&mut self, iteration: usize) {
        if iteration > 0 {
            let current_loss = self.optimization_history.last().unwrap().0;
            let previous_loss = self.optimization_history[self.optimization_history.len() - 2].0;

            if current_loss > previous_loss {
                // Decrease learning rate if loss increased
                self.config.learning_rate *= self.config.learning_rate_decay;
                self.config.learning_rate =
                    self.config.learning_rate.max(self.config.min_learning_rate);
            }
        }
    }

    /// Compute kernel matrix
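    ///
    /// Uses an RBF kernel `k(x, z) = exp(-gamma * ||x - z||^2)` with
    /// `gamma = parameters[0]`; this is the only kernel family implemented here.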
    fn compute_kernel_matrix(&self, x: &Array2<f64>) -> Result<Array2<f64>> {
        let n_samples = x.nrows();
        let mut kernel_matrix = Array2::zeros((n_samples, n_samples));

        // Assume RBF kernel with parameters[0] as gamma
        let gamma = self.parameters[0];

        for i in 0..n_samples {
            for j in i..n_samples {
                let dist_sq = x
                    .row(i)
                    .iter()
                    .zip(x.row(j).iter())
                    .map(|(&a, &b)| (a - b).powi(2))
                    .sum::<f64>();

                let kernel_value = (-gamma * dist_sq).exp();
                kernel_matrix[[i, j]] = kernel_value;
                kernel_matrix[[j, i]] = kernel_value;
            }
        }

        Ok(kernel_matrix)
    }

    /// Compute kernel matrix derivatives
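    ///
    /// For the RBF kernel, `d k(x, z) / d gamma = -||x - z||^2 * k(x, z)`;
    /// derivatives for any additional parameters are returned as zero matrices.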
    fn compute_kernel_derivatives(&self, x: &Array2<f64>) -> Result<Vec<Array2<f64>>> {
        let n_samples = x.nrows();
        let mut derivatives = Vec::new();

        // Derivative with respect to gamma
        let gamma = self.parameters[0];
        let mut gamma_derivative = Array2::zeros((n_samples, n_samples));

        for i in 0..n_samples {
            for j in i..n_samples {
                let dist_sq = x
                    .row(i)
                    .iter()
                    .zip(x.row(j).iter())
                    .map(|(&a, &b)| (a - b).powi(2))
                    .sum::<f64>();

                let kernel_value = (-gamma * dist_sq).exp();
                let derivative_value = -dist_sq * kernel_value;

                gamma_derivative[[i, j]] = derivative_value;
                gamma_derivative[[j, i]] = derivative_value;
            }
        }

        derivatives.push(gamma_derivative);

        // Placeholder (zero) derivatives for any additional parameters
        for _ in 1..self.parameters.len() {
            let derivative = Array2::zeros((n_samples, n_samples));
            derivatives.push(derivative);
        }

        Ok(derivatives)
    }

    /// Compute kernel alignment
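    ///
    /// Simplified alignment score `trace(K) / ||K||_F`, i.e. the Frobenius
    /// inner product of `K` with the identity divided by the Frobenius norm
    /// of `K` (the usual `1 / sqrt(n)` normalization is omitted).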
    fn compute_kernel_alignment(&self, kernel_matrix: &Array2<f64>) -> f64 {
        let n_samples = kernel_matrix.nrows();
        let trace = (0..n_samples).map(|i| kernel_matrix[[i, i]]).sum::<f64>();
        let frobenius_norm = kernel_matrix.iter().map(|&x| x * x).sum::<f64>().sqrt();

        trace / frobenius_norm
    }

    /// Compute alignment derivative
    fn compute_alignment_derivative(
        &self,
        kernel_matrix: &Array2<f64>,
        kernel_derivative: &Array2<f64>,
    ) -> f64 {
        let n_samples = kernel_matrix.nrows();
        let trace = (0..n_samples).map(|i| kernel_matrix[[i, i]]).sum::<f64>();
        let trace_derivative = (0..n_samples)
            .map(|i| kernel_derivative[[i, i]])
            .sum::<f64>();

        let frobenius_norm = kernel_matrix.iter().map(|&x| x * x).sum::<f64>().sqrt();
        let frobenius_derivative = kernel_matrix
            .iter()
            .zip(kernel_derivative.iter())
            .map(|(&k, &dk)| k * dk)
            .sum::<f64>()
            / frobenius_norm;

        // Quotient rule for d(trace(K) / ||K||_F) with respect to the parameter
        (trace_derivative * frobenius_norm - trace * frobenius_derivative)
            / (frobenius_norm * frobenius_norm)
    }

    /// Split data for cross-validation
    fn split_data(
        &self,
        x: &Array2<f64>,
        y: &Array1<f64>,
        start_idx: usize,
        end_idx: usize,
    ) -> (Array2<f64>, Array1<f64>, Array2<f64>, Array1<f64>) {
        let n_samples = x.nrows();
        let n_features = x.ncols();

        let mut x_train = Array2::zeros((n_samples - (end_idx - start_idx), n_features));
        let mut y_train = Array1::zeros(n_samples - (end_idx - start_idx));
        let mut x_val = Array2::zeros((end_idx - start_idx, n_features));
        let mut y_val = Array1::zeros(end_idx - start_idx);

        let mut train_idx = 0;
        let mut val_idx = 0;

        for i in 0..n_samples {
            if i >= start_idx && i < end_idx {
                x_val.row_mut(val_idx).assign(&x.row(i));
                y_val[val_idx] = y[i];
                val_idx += 1;
            } else {
                x_train.row_mut(train_idx).assign(&x.row(i));
                y_train[train_idx] = y[i];
                train_idx += 1;
            }
        }

        (x_train, y_train, x_val, y_val)
    }

    /// Compute fold gradient
    fn compute_fold_gradient(
        &self,
        _x_train: &Array2<f64>,
        _y_train: &Array1<f64>,
        _x_val: &Array2<f64>,
        _y_val: &Array1<f64>,
    ) -> Result<GradientResult> {
        // Simplified fold gradient computation
        let gradient = Array1::zeros(self.parameters.len());
        let objective_value = 0.0;

        Ok(GradientResult {
            gradient,
            objective_value,
            hessian: None,
        })
    }

    /// Compute log marginal likelihood
    fn compute_log_marginal_likelihood(
        &self,
        _kernel_matrix: &Array2<f64>,
        _y: &Array1<f64>,
    ) -> Result<f64> {
        // Simplified log marginal likelihood
        Ok(0.0)
    }

    /// Compute marginal likelihood derivative
    fn compute_marginal_likelihood_derivative(
        &self,
        _kernel_matrix: &Array2<f64>,
        _y: &Array1<f64>,
        _kernel_derivative: &Array2<f64>,
    ) -> Result<f64> {
        // Simplified derivative computation
        Ok(0.0)
    }

    /// Compute kernel ridge loss
    fn compute_kernel_ridge_loss(
        &self,
        _kernel_matrix: &Array2<f64>,
        _y: &Array1<f64>,
    ) -> Result<f64> {
        // Simplified kernel ridge loss
        Ok(0.0)
    }

    /// Compute kernel ridge derivative
    fn compute_kernel_ridge_derivative(
        &self,
        _kernel_matrix: &Array2<f64>,
        _y: &Array1<f64>,
        _kernel_derivative: &Array2<f64>,
    ) -> Result<f64> {
        // Simplified derivative computation
        Ok(0.0)
    }

    /// Compute MMD
    fn compute_mmd(&self, _x1: &ArrayView2<f64>, _x2: &ArrayView2<f64>) -> Result<f64> {
        // Simplified MMD computation
        Ok(0.0)
    }

    /// Compute MMD derivatives
    fn compute_mmd_derivatives(
        &self,
        _x1: &ArrayView2<f64>,
        _x2: &ArrayView2<f64>,
    ) -> Result<Array1<f64>> {
        // Simplified derivative computation
        Ok(Array1::zeros(self.parameters.len()))
    }

    /// Compute target kernel matrix
    fn compute_target_kernel(&self, y: &Array1<f64>) -> Array2<f64> {
        let n_samples = y.len();
        let mut target_kernel = Array2::zeros((n_samples, n_samples));

        for i in 0..n_samples {
            for j in 0..n_samples {
                target_kernel[[i, j]] = y[i] * y[j];
            }
        }

        target_kernel
    }

    /// Compute kernel target alignment
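    ///
    /// Computes `<K, T>_F / (||K||_F * ||T||_F)` with target kernel
    /// `T = y * y^T` as built by `compute_target_kernel`.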
    fn compute_kta(&self, kernel_matrix: &Array2<f64>, target_kernel: &Array2<f64>) -> f64 {
        let numerator = kernel_matrix
            .iter()
            .zip(target_kernel.iter())
            .map(|(&k, &t)| k * t)
            .sum::<f64>();

        let k_norm = kernel_matrix.iter().map(|&k| k * k).sum::<f64>().sqrt();
        let t_norm = target_kernel.iter().map(|&t| t * t).sum::<f64>().sqrt();

        numerator / (k_norm * t_norm)
    }

    /// Compute KTA derivative
    fn compute_kta_derivative(
        &self,
        _kernel_matrix: &Array2<f64>,
        _target_kernel: &Array2<f64>,
        _kernel_derivative: &Array2<f64>,
    ) -> f64 {
        // Simplified KTA derivative
        0.0
    }

    /// Get current parameters
    pub fn get_parameters(&self) -> &Array1<f64> {
        &self.parameters
    }

    /// Get optimization history
    pub fn get_optimization_history(&self) -> &Vec<(f64, Array1<f64>)> {
        &self.optimization_history
    }
}

/// Gradient-based multi-kernel learning
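///
/// A minimal sketch of optimizing several kernels and reading back the learned
/// combination weights (marked `ignore`; in this simplified implementation the
/// weights stay uniform):
///
/// ```ignore
/// let mut multi = GradientMultiKernelLearner::new(3, 2);
/// multi.optimize(&x, None).unwrap();
/// let weights = multi.get_combination_weights();
/// let per_kernel_params = multi.get_all_parameters();
/// ```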
pub struct GradientMultiKernelLearner {
    base_learners: Vec<GradientKernelLearner>,
    combination_weights: Array1<f64>,
    config: GradientConfig,
}

impl GradientMultiKernelLearner {
    /// Create a new gradient-based multi-kernel learner
    pub fn new(n_kernels: usize, n_parameters_per_kernel: usize) -> Self {
        let mut base_learners = Vec::new();
        for _ in 0..n_kernels {
            base_learners.push(GradientKernelLearner::new(n_parameters_per_kernel));
        }

        Self {
            base_learners,
            combination_weights: Array1::from_elem(n_kernels, 1.0 / n_kernels as f64),
            config: GradientConfig::default(),
        }
    }

    /// Optimize all kernels and combination weights
    pub fn optimize(&mut self, x: &Array2<f64>, y: Option<&Array1<f64>>) -> Result<()> {
        // Optimize individual kernels
        for learner in &mut self.base_learners {
            learner.optimize(x, y)?;
        }

        // Optimize combination weights
        self.optimize_combination_weights(x, y)?;

        Ok(())
    }

    /// Optimize combination weights
    fn optimize_combination_weights(
        &mut self,
        _x: &Array2<f64>,
        _y: Option<&Array1<f64>>,
    ) -> Result<()> {
        // Simplified combination weight optimization: keep uniform weights
        let n_kernels = self.base_learners.len();
        self.combination_weights = Array1::from_elem(n_kernels, 1.0 / n_kernels as f64);
        Ok(())
    }

    /// Get optimized parameters for all kernels
    pub fn get_all_parameters(&self) -> Vec<&Array1<f64>> {
        self.base_learners
            .iter()
            .map(|learner| learner.get_parameters())
            .collect()
    }

    /// Get combination weights
    pub fn get_combination_weights(&self) -> &Array1<f64> {
        &self.combination_weights
    }
}

#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array2;

    #[test]
    fn test_gradient_config() {
        let config = GradientConfig::default();
        assert_eq!(config.learning_rate, 0.01);
        assert_eq!(config.max_iterations, 1000);
        assert!(config.tolerance > 0.0);
    }

    #[test]
    fn test_gradient_kernel_learner() {
        let mut learner = GradientKernelLearner::new(2)
            .with_optimizer(GradientOptimizer::Adam)
            .with_objective(KernelObjective::KernelAlignment);

        let x =
            Array2::from_shape_vec((4, 2), vec![1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0]).unwrap();

        learner.initialize_parameters(Array1::from_vec(vec![1.0, 0.5]));
        let optimized_params = learner.optimize(&x, None).unwrap();

        assert_eq!(optimized_params.len(), 2);
    }

    #[test]
    fn test_gradient_optimizers() {
        let optimizers = vec![
            GradientOptimizer::SGD,
            GradientOptimizer::Momentum,
            GradientOptimizer::Adam,
            GradientOptimizer::AdaGrad,
            GradientOptimizer::RMSprop,
        ];

        for optimizer in optimizers {
            let mut learner = GradientKernelLearner::new(1).with_optimizer(optimizer);

            let x = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 2.0, 3.0, 3.0, 4.0]).unwrap();

            learner.initialize_parameters(Array1::from_vec(vec![1.0]));
            let result = learner.optimize(&x, None);
            assert!(result.is_ok());
        }
    }

    #[test]
    fn test_parameter_bounds() {
        let mut learner = GradientKernelLearner::new(2).with_bounds(
            Array2::from_shape_vec(
                (2, 2),
                vec![
                    0.1, 10.0, // Parameter 0: [0.1, 10.0]
                    0.0, 5.0, // Parameter 1: [0.0, 5.0]
                ],
            )
            .unwrap(),
        );

        let x =
            Array2::from_shape_vec((4, 2), vec![1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0]).unwrap();

        learner.initialize_parameters(Array1::from_vec(vec![100.0, -1.0]));
        let optimized_params = learner.optimize(&x, None).unwrap();

        assert!(optimized_params[0] >= 0.1 && optimized_params[0] <= 10.0);
        assert!(optimized_params[1] >= 0.0 && optimized_params[1] <= 5.0);
    }

    #[test]
    fn test_multi_kernel_learner() {
        let mut multi_learner = GradientMultiKernelLearner::new(3, 2);

        let x =
            Array2::from_shape_vec((4, 2), vec![1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0]).unwrap();

        multi_learner.optimize(&x, None).unwrap();

        let all_params = multi_learner.get_all_parameters();
        assert_eq!(all_params.len(), 3);

        let weights = multi_learner.get_combination_weights();
        assert_eq!(weights.len(), 3);
    }

    #[test]
    fn test_objective_functions() {
        let objectives = vec![
            KernelObjective::KernelAlignment,
            KernelObjective::CrossValidationError,
            KernelObjective::MarginalLikelihood,
            KernelObjective::KernelRidgeLoss,
            KernelObjective::MaximumMeanDiscrepancy,
            KernelObjective::KernelTargetAlignment,
        ];

        for objective in objectives {
            let mut learner = GradientKernelLearner::new(1).with_objective(objective.clone());

            let x = Array2::from_shape_vec((4, 2), vec![1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0])
                .unwrap();

            let y = Array1::from_vec(vec![1.0, 0.0, 1.0, 0.0]);

            learner.initialize_parameters(Array1::from_vec(vec![1.0]));

            let result = if objective == KernelObjective::KernelAlignment
                || objective == KernelObjective::MaximumMeanDiscrepancy
            {
                learner.optimize(&x, None)
            } else {
                learner.optimize(&x, Some(&y))
            };

            assert!(result.is_ok());
        }
    }

    #[test]
    fn test_adaptive_learning_rate() {
        let config = GradientConfig {
            adaptive_learning_rate: true,
            learning_rate_decay: 0.5,
            min_learning_rate: 1e-6,
            ..Default::default()
        };

        let mut learner = GradientKernelLearner::new(1).with_config(config);

        let x =
            Array2::from_shape_vec((4, 2), vec![1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0]).unwrap();

        learner.initialize_parameters(Array1::from_vec(vec![1.0]));
        let result = learner.optimize(&x, None);
        assert!(result.is_ok());
    }
}