// sklears_kernel_approximation/gradient_kernel_learning.rs
1//! Gradient-based kernel learning for automatic parameter optimization
2//!
3//! This module provides gradient-based optimization methods for learning optimal
4//! kernel parameters, including bandwidth selection, kernel combination weights,
5//! and hyperparameter tuning using automatic differentiation.
6
7use scirs2_core::ndarray::{s, Array1, Array2, ArrayView2};
8use sklears_core::error::Result;
9
/// Gradient-based optimization configuration.
///
/// Hyperparameters shared by every optimizer in this module; default values
/// come from `GradientConfig::default`.
#[derive(Clone, Debug)]
pub struct GradientConfig {
    /// Learning rate (step size) for gradient descent
    pub learning_rate: f64,
    /// Maximum number of iterations
    pub max_iterations: usize,
    /// Convergence tolerance on the L1 norm of the gradient
    pub tolerance: f64,
    /// Momentum parameter (used by the `Momentum` optimizer)
    pub momentum: f64,
    /// L2 regularization strength
    // NOTE(review): not currently read by any update rule in this module.
    pub l2_regularization: f64,
    /// Whether to use adaptive learning rate (decay when the objective worsens)
    pub adaptive_learning_rate: bool,
    /// Learning rate decay factor applied when the objective increases
    pub learning_rate_decay: f64,
    /// Minimum learning rate (floor for the decayed rate)
    pub min_learning_rate: f64,
    /// Batch size for stochastic gradient descent
    // NOTE(review): current implementations are full-batch; this field is
    // not yet consulted.
    pub batch_size: usize,
}
33
34impl Default for GradientConfig {
35    fn default() -> Self {
36        Self {
37            learning_rate: 0.01,
38            max_iterations: 1000,
39            tolerance: 1e-6,
40            momentum: 0.9,
41            l2_regularization: 1e-4,
42            adaptive_learning_rate: true,
43            learning_rate_decay: 0.99,
44            min_learning_rate: 1e-6,
45            batch_size: 256,
46        }
47    }
48}
49
/// Gradient-based optimization algorithms selectable via
/// `GradientKernelLearner::with_optimizer`.
#[derive(Clone, Debug, PartialEq)]
pub enum GradientOptimizer {
    /// Standard gradient descent
    SGD,
    /// Momentum-based gradient descent
    Momentum,
    /// Adam optimizer (bias-corrected adaptive moment estimates)
    Adam,
    /// AdaGrad optimizer (accumulated squared gradients)
    AdaGrad,
    /// RMSprop optimizer (exponentially decayed squared-gradient average)
    RMSprop,
    /// L-BFGS optimizer
    // NOTE(review): currently implemented as plain gradient descent; no
    // curvature history is maintained yet.
    LBFGS,
}
67
/// Objective function for kernel learning.
///
/// Supervised objectives (cross-validation error, marginal likelihood,
/// kernel ridge loss, kernel target alignment) require targets `y` at
/// optimization time; kernel alignment and MMD are unsupervised.
#[derive(Clone, Debug, PartialEq)]
pub enum KernelObjective {
    /// Kernel alignment
    KernelAlignment,
    /// Cross-validation error
    CrossValidationError,
    /// Marginal likelihood (for Gaussian processes)
    MarginalLikelihood,
    /// Kernel ridge regression loss
    KernelRidgeLoss,
    /// Maximum mean discrepancy
    MaximumMeanDiscrepancy,
    /// Kernel target alignment
    KernelTargetAlignment,
}
85
/// Gradient computation result returned by the objective-specific routines.
#[derive(Clone, Debug)]
pub struct GradientResult {
    /// Gradient of the objective with respect to each kernel parameter
    pub gradient: Array1<f64>,
    /// Objective function value (already negated for maximization
    /// objectives so that callers always minimize)
    pub objective_value: f64,
    /// Hessian matrix (if computed; currently always `None`)
    pub hessian: Option<Array2<f64>>,
}
97
/// Gradient-based kernel parameter learner.
///
/// Holds the current parameter vector plus per-optimizer state (momentum
/// velocity, Adam moment estimates) and the optimization trace.
pub struct GradientKernelLearner {
    // Hyperparameters controlling the optimization loop.
    config: GradientConfig,
    // Which update rule `update_parameters` dispatches to.
    optimizer: GradientOptimizer,
    // Which objective `compute_gradient` dispatches to.
    objective: KernelObjective,
    // Current kernel parameters; parameters[0] is the RBF gamma.
    parameters: Array1<f64>,
    // Optional (n_parameters x 2) matrix of [lower, upper] bounds.
    parameter_bounds: Option<Array2<f64>>,
    // (objective_value, parameters) snapshot after each update.
    optimization_history: Vec<(f64, Array1<f64>)>,
    // Momentum state; created by `initialize_parameters`.
    velocity: Option<Array1<f64>>,
    // Adam first-moment estimate; created lazily or on initialization.
    adam_m: Option<Array1<f64>>,
    // Adam second-moment estimate; also reused by AdaGrad/RMSprop as their
    // squared-gradient accumulator.
    adam_v: Option<Array1<f64>>,
    // Current iteration index (drives Adam bias correction).
    iteration: usize,
}
111
112impl GradientKernelLearner {
    /// Create a new gradient-based kernel learner for `n_parameters` kernel
    /// parameters (all initialized to 1.0), defaulting to the Adam optimizer
    /// and the kernel-alignment objective.
    ///
    /// Optimizer state buffers are left unset here; they are created by
    /// `initialize_parameters`, or lazily by update rules that support it.
    pub fn new(n_parameters: usize) -> Self {
        Self {
            config: GradientConfig::default(),
            optimizer: GradientOptimizer::Adam,
            objective: KernelObjective::KernelAlignment,
            parameters: Array1::ones(n_parameters),
            parameter_bounds: None,
            optimization_history: Vec::new(),
            velocity: None,
            adam_m: None,
            adam_v: None,
            iteration: 0,
        }
    }
128
129    /// Set configuration
130    pub fn with_config(mut self, config: GradientConfig) -> Self {
131        self.config = config;
132        self
133    }
134
135    /// Set optimizer
136    pub fn with_optimizer(mut self, optimizer: GradientOptimizer) -> Self {
137        self.optimizer = optimizer;
138        self
139    }
140
141    /// Set objective function
142    pub fn with_objective(mut self, objective: KernelObjective) -> Self {
143        self.objective = objective;
144        self
145    }
146
147    /// Set parameter bounds
148    pub fn with_bounds(mut self, bounds: Array2<f64>) -> Self {
149        self.parameter_bounds = Some(bounds);
150        self
151    }
152
153    /// Initialize parameters
154    pub fn initialize_parameters(&mut self, initial_params: Array1<f64>) {
155        self.parameters = initial_params;
156        self.velocity = Some(Array1::zeros(self.parameters.len()));
157        self.adam_m = Some(Array1::zeros(self.parameters.len()));
158        self.adam_v = Some(Array1::zeros(self.parameters.len()));
159        self.iteration = 0;
160        self.optimization_history.clear();
161        // Apply bounds to ensure initial parameters are within constraints
162        self.apply_bounds();
163    }
164
165    /// Optimize kernel parameters
166    pub fn optimize(&mut self, x: &Array2<f64>, y: Option<&Array1<f64>>) -> Result<Array1<f64>> {
167        for iteration in 0..self.config.max_iterations {
168            self.iteration = iteration;
169
170            // Compute gradient
171            let gradient_result = self.compute_gradient(x, y)?;
172
173            // Check convergence
174            if gradient_result
175                .gradient
176                .iter()
177                .map(|&g| g.abs())
178                .sum::<f64>()
179                < self.config.tolerance
180            {
181                break;
182            }
183
184            // Update parameters
185            self.update_parameters(&gradient_result.gradient)?;
186
187            // Store optimization history
188            self.optimization_history
189                .push((gradient_result.objective_value, self.parameters.clone()));
190
191            // Adaptive learning rate
192            if self.config.adaptive_learning_rate && iteration > 0 {
193                self.update_learning_rate(iteration);
194            }
195        }
196
197        Ok(self.parameters.clone())
198    }
199
    /// Dispatch gradient computation to the objective-specific routine.
    ///
    /// `y` is only consulted by the supervised objectives; the unsupervised
    /// ones (kernel alignment, MMD) ignore it.
    fn compute_gradient(&self, x: &Array2<f64>, y: Option<&Array1<f64>>) -> Result<GradientResult> {
        match self.objective {
            KernelObjective::KernelAlignment => self.compute_kernel_alignment_gradient(x),
            KernelObjective::CrossValidationError => self.compute_cv_error_gradient(x, y),
            KernelObjective::MarginalLikelihood => self.compute_marginal_likelihood_gradient(x, y),
            KernelObjective::KernelRidgeLoss => self.compute_kernel_ridge_gradient(x, y),
            KernelObjective::MaximumMeanDiscrepancy => self.compute_mmd_gradient(x),
            KernelObjective::KernelTargetAlignment => self.compute_kta_gradient(x, y),
        }
    }
211
212    /// Compute kernel alignment gradient
213    fn compute_kernel_alignment_gradient(&self, x: &Array2<f64>) -> Result<GradientResult> {
214        let _n_samples = x.nrows();
215        let mut gradient = Array1::zeros(self.parameters.len());
216
217        // Compute kernel matrix
218        let kernel_matrix = self.compute_kernel_matrix(x)?;
219
220        // Compute kernel matrix derivatives
221        let kernel_derivatives = self.compute_kernel_derivatives(x)?;
222
223        // Compute alignment and its gradient
224        let alignment = self.compute_kernel_alignment(&kernel_matrix);
225
226        for i in 0..self.parameters.len() {
227            let kernel_derivative = &kernel_derivatives[i];
228            let alignment_derivative =
229                self.compute_alignment_derivative(&kernel_matrix, kernel_derivative);
230            gradient[i] = alignment_derivative;
231        }
232
233        Ok(GradientResult {
234            gradient,
235            objective_value: alignment,
236            hessian: None,
237        })
238    }
239
240    /// Compute cross-validation error gradient
241    fn compute_cv_error_gradient(
242        &self,
243        x: &Array2<f64>,
244        y: Option<&Array1<f64>>,
245    ) -> Result<GradientResult> {
246        let y = y.ok_or("Target values required for CV error gradient")?;
247        let n_samples = x.nrows();
248        let n_folds = 5;
249        let fold_size = n_samples / n_folds;
250
251        let mut gradient = Array1::zeros(self.parameters.len());
252        let mut total_error = 0.0;
253
254        for fold in 0..n_folds {
255            let start_idx = fold * fold_size;
256            let end_idx = std::cmp::min(start_idx + fold_size, n_samples);
257
258            // Split data
259            let (x_train, y_train, x_val, y_val) = self.split_data(x, y, start_idx, end_idx);
260
261            // Compute fold gradient
262            let fold_gradient = self.compute_fold_gradient(&x_train, &y_train, &x_val, &y_val)?;
263
264            gradient = gradient + fold_gradient.gradient;
265            total_error += fold_gradient.objective_value;
266        }
267
268        gradient /= n_folds as f64;
269        total_error /= n_folds as f64;
270
271        Ok(GradientResult {
272            gradient,
273            objective_value: total_error,
274            hessian: None,
275        })
276    }
277
278    /// Compute marginal likelihood gradient
279    fn compute_marginal_likelihood_gradient(
280        &self,
281        x: &Array2<f64>,
282        y: Option<&Array1<f64>>,
283    ) -> Result<GradientResult> {
284        let y = y.ok_or("Target values required for marginal likelihood gradient")?;
285        let n_samples = x.nrows();
286
287        // Compute kernel matrix
288        let kernel_matrix = self.compute_kernel_matrix(x)?;
289
290        // Add noise term
291        let noise_variance = 1e-6;
292        let mut k_with_noise = kernel_matrix.clone();
293        for i in 0..n_samples {
294            k_with_noise[[i, i]] += noise_variance;
295        }
296
297        // Compute log marginal likelihood
298        let log_marginal_likelihood = self.compute_log_marginal_likelihood(&k_with_noise, y)?;
299
300        // Compute gradient
301        let mut gradient = Array1::zeros(self.parameters.len());
302        let kernel_derivatives = self.compute_kernel_derivatives(x)?;
303
304        for i in 0..self.parameters.len() {
305            let kernel_derivative = &kernel_derivatives[i];
306            let ml_derivative =
307                self.compute_marginal_likelihood_derivative(&k_with_noise, y, kernel_derivative)?;
308            gradient[i] = ml_derivative;
309        }
310
311        Ok(GradientResult {
312            gradient,
313            objective_value: -log_marginal_likelihood, // Negative for minimization
314            hessian: None,
315        })
316    }
317
318    /// Compute kernel ridge regression gradient
319    fn compute_kernel_ridge_gradient(
320        &self,
321        x: &Array2<f64>,
322        y: Option<&Array1<f64>>,
323    ) -> Result<GradientResult> {
324        let y = y.ok_or("Target values required for kernel ridge gradient")?;
325        let n_samples = x.nrows();
326        let alpha = 1e-3; // Regularization parameter
327
328        // Compute kernel matrix
329        let kernel_matrix = self.compute_kernel_matrix(x)?;
330
331        // Add regularization
332        let mut k_reg = kernel_matrix.clone();
333        for i in 0..n_samples {
334            k_reg[[i, i]] += alpha;
335        }
336
337        // Compute kernel ridge loss
338        let kr_loss = self.compute_kernel_ridge_loss(&k_reg, y)?;
339
340        // Compute gradient
341        let mut gradient = Array1::zeros(self.parameters.len());
342        let kernel_derivatives = self.compute_kernel_derivatives(x)?;
343
344        for i in 0..self.parameters.len() {
345            let kernel_derivative = &kernel_derivatives[i];
346            let kr_derivative =
347                self.compute_kernel_ridge_derivative(&k_reg, y, kernel_derivative)?;
348            gradient[i] = kr_derivative;
349        }
350
351        Ok(GradientResult {
352            gradient,
353            objective_value: kr_loss,
354            hessian: None,
355        })
356    }
357
358    /// Compute maximum mean discrepancy gradient
359    fn compute_mmd_gradient(&self, x: &Array2<f64>) -> Result<GradientResult> {
360        let n_samples = x.nrows();
361        let split_point = n_samples / 2;
362
363        let x1 = x.slice(s![..split_point, ..]);
364        let x2 = x.slice(s![split_point.., ..]);
365
366        // Compute MMD
367        let mmd = self.compute_mmd(&x1, &x2)?;
368
369        // Compute gradient
370        let mut gradient = Array1::zeros(self.parameters.len());
371        let mmd_derivatives = self.compute_mmd_derivatives(&x1, &x2)?;
372
373        for i in 0..self.parameters.len() {
374            gradient[i] = mmd_derivatives[i];
375        }
376
377        Ok(GradientResult {
378            gradient,
379            objective_value: mmd,
380            hessian: None,
381        })
382    }
383
384    /// Compute kernel target alignment gradient
385    fn compute_kta_gradient(
386        &self,
387        x: &Array2<f64>,
388        y: Option<&Array1<f64>>,
389    ) -> Result<GradientResult> {
390        let y = y.ok_or("Target values required for KTA gradient")?;
391
392        // Compute kernel matrix
393        let kernel_matrix = self.compute_kernel_matrix(x)?;
394
395        // Compute target kernel matrix
396        let target_kernel = self.compute_target_kernel(y);
397
398        // Compute KTA
399        let kta = self.compute_kta(&kernel_matrix, &target_kernel);
400
401        // Compute gradient
402        let mut gradient = Array1::zeros(self.parameters.len());
403        let kernel_derivatives = self.compute_kernel_derivatives(x)?;
404
405        for i in 0..self.parameters.len() {
406            let kernel_derivative = &kernel_derivatives[i];
407            let kta_derivative =
408                self.compute_kta_derivative(&kernel_matrix, &target_kernel, kernel_derivative);
409            gradient[i] = kta_derivative;
410        }
411
412        Ok(GradientResult {
413            gradient,
414            objective_value: -kta, // Negative for minimization
415            hessian: None,
416        })
417    }
418
    /// Update parameters in place using the configured optimizer.
    ///
    /// Every update rule calls `apply_bounds` after stepping, so the
    /// parameters always remain within any configured box constraints.
    fn update_parameters(&mut self, gradient: &Array1<f64>) -> Result<()> {
        match self.optimizer {
            GradientOptimizer::SGD => self.update_sgd(gradient),
            GradientOptimizer::Momentum => self.update_momentum(gradient),
            GradientOptimizer::Adam => self.update_adam(gradient),
            GradientOptimizer::AdaGrad => self.update_adagrad(gradient),
            GradientOptimizer::RMSprop => self.update_rmsprop(gradient),
            GradientOptimizer::LBFGS => self.update_lbfgs(gradient),
        }
    }
430
431    /// SGD update
432    fn update_sgd(&mut self, gradient: &Array1<f64>) -> Result<()> {
433        for i in 0..self.parameters.len() {
434            self.parameters[i] -= self.config.learning_rate * gradient[i];
435        }
436        self.apply_bounds();
437        Ok(())
438    }
439
440    /// Momentum update
441    fn update_momentum(&mut self, gradient: &Array1<f64>) -> Result<()> {
442        let velocity = self.velocity.as_mut().expect("operation should succeed");
443
444        for i in 0..self.parameters.len() {
445            velocity[i] =
446                self.config.momentum * velocity[i] - self.config.learning_rate * gradient[i];
447            self.parameters[i] += velocity[i];
448        }
449
450        self.apply_bounds();
451        Ok(())
452    }
453
454    /// Adam update
455    fn update_adam(&mut self, gradient: &Array1<f64>) -> Result<()> {
456        // Initialize Adam state if not already done
457        if self.adam_m.is_none() {
458            self.adam_m = Some(Array1::zeros(self.parameters.len()));
459            self.adam_v = Some(Array1::zeros(self.parameters.len()));
460        }
461
462        let adam_m = self.adam_m.as_mut().expect("operation should succeed");
463        let adam_v = self.adam_v.as_mut().expect("operation should succeed");
464
465        let beta1 = 0.9;
466        let beta2 = 0.999;
467        let epsilon = 1e-8;
468
469        for i in 0..self.parameters.len() {
470            // Update biased first moment estimate
471            adam_m[i] = beta1 * adam_m[i] + (1.0 - beta1) * gradient[i];
472
473            // Update biased second raw moment estimate
474            adam_v[i] = beta2 * adam_v[i] + (1.0 - beta2) * gradient[i] * gradient[i];
475
476            // Compute bias-corrected first moment estimate
477            let m_hat = adam_m[i] / (1.0 - beta1.powi(self.iteration as i32 + 1));
478
479            // Compute bias-corrected second raw moment estimate
480            let v_hat = adam_v[i] / (1.0 - beta2.powi(self.iteration as i32 + 1));
481
482            // Update parameters
483            self.parameters[i] -= self.config.learning_rate * m_hat / (v_hat.sqrt() + epsilon);
484        }
485
486        self.apply_bounds();
487        Ok(())
488    }
489
490    /// AdaGrad update
491    fn update_adagrad(&mut self, gradient: &Array1<f64>) -> Result<()> {
492        if self.adam_v.is_none() {
493            self.adam_v = Some(Array1::zeros(self.parameters.len()));
494        }
495
496        let accumulated_grad = self.adam_v.as_mut().expect("operation should succeed");
497        let epsilon = 1e-8;
498
499        for i in 0..self.parameters.len() {
500            accumulated_grad[i] += gradient[i] * gradient[i];
501            self.parameters[i] -=
502                self.config.learning_rate * gradient[i] / (accumulated_grad[i].sqrt() + epsilon);
503        }
504
505        self.apply_bounds();
506        Ok(())
507    }
508
509    /// RMSprop update
510    fn update_rmsprop(&mut self, gradient: &Array1<f64>) -> Result<()> {
511        if self.adam_v.is_none() {
512            self.adam_v = Some(Array1::zeros(self.parameters.len()));
513        }
514
515        let accumulated_grad = self.adam_v.as_mut().expect("operation should succeed");
516        let decay_rate = 0.9;
517        let epsilon = 1e-8;
518
519        for i in 0..self.parameters.len() {
520            accumulated_grad[i] =
521                decay_rate * accumulated_grad[i] + (1.0 - decay_rate) * gradient[i] * gradient[i];
522            self.parameters[i] -=
523                self.config.learning_rate * gradient[i] / (accumulated_grad[i].sqrt() + epsilon);
524        }
525
526        self.apply_bounds();
527        Ok(())
528    }
529
530    /// L-BFGS update (simplified version)
531    fn update_lbfgs(&mut self, gradient: &Array1<f64>) -> Result<()> {
532        // Simplified L-BFGS - just use gradient descent for now
533        for i in 0..self.parameters.len() {
534            self.parameters[i] -= self.config.learning_rate * gradient[i];
535        }
536        self.apply_bounds();
537        Ok(())
538    }
539
540    /// Apply parameter bounds
541    fn apply_bounds(&mut self) {
542        if let Some(bounds) = &self.parameter_bounds {
543            for i in 0..self.parameters.len() {
544                self.parameters[i] = self.parameters[i].max(bounds[[i, 0]]).min(bounds[[i, 1]]);
545            }
546        }
547    }
548
549    /// Update learning rate adaptively
550    fn update_learning_rate(&mut self, iteration: usize) {
551        if iteration > 0 {
552            let current_loss = self
553                .optimization_history
554                .last()
555                .expect("operation should succeed")
556                .0;
557            let previous_loss = self.optimization_history[self.optimization_history.len() - 2].0;
558
559            if current_loss > previous_loss {
560                // Decrease learning rate if loss increased
561                self.config.learning_rate *= self.config.learning_rate_decay;
562                self.config.learning_rate =
563                    self.config.learning_rate.max(self.config.min_learning_rate);
564            }
565        }
566    }
567
568    /// Compute kernel matrix
569    fn compute_kernel_matrix(&self, x: &Array2<f64>) -> Result<Array2<f64>> {
570        let n_samples = x.nrows();
571        let mut kernel_matrix = Array2::zeros((n_samples, n_samples));
572
573        // Assume RBF kernel with parameters[0] as gamma
574        let gamma = self.parameters[0];
575
576        for i in 0..n_samples {
577            for j in i..n_samples {
578                let dist_sq = x
579                    .row(i)
580                    .iter()
581                    .zip(x.row(j).iter())
582                    .map(|(&a, &b)| (a - b).powi(2))
583                    .sum::<f64>();
584
585                let kernel_value = (-gamma * dist_sq).exp();
586                kernel_matrix[[i, j]] = kernel_value;
587                kernel_matrix[[j, i]] = kernel_value;
588            }
589        }
590
591        Ok(kernel_matrix)
592    }
593
594    /// Compute kernel matrix derivatives
595    fn compute_kernel_derivatives(&self, x: &Array2<f64>) -> Result<Vec<Array2<f64>>> {
596        let n_samples = x.nrows();
597        let mut derivatives = Vec::new();
598
599        // Derivative with respect to gamma
600        let gamma = self.parameters[0];
601        let mut gamma_derivative = Array2::zeros((n_samples, n_samples));
602
603        for i in 0..n_samples {
604            for j in i..n_samples {
605                let dist_sq = x
606                    .row(i)
607                    .iter()
608                    .zip(x.row(j).iter())
609                    .map(|(&a, &b)| (a - b).powi(2))
610                    .sum::<f64>();
611
612                let kernel_value = (-gamma * dist_sq).exp();
613                let derivative_value = -dist_sq * kernel_value;
614
615                gamma_derivative[[i, j]] = derivative_value;
616                gamma_derivative[[j, i]] = derivative_value;
617            }
618        }
619
620        derivatives.push(gamma_derivative);
621
622        // Add derivatives for other parameters if needed
623        for _param_idx in 1..self.parameters.len() {
624            let derivative = Array2::zeros((n_samples, n_samples));
625            derivatives.push(derivative);
626        }
627
628        Ok(derivatives)
629    }
630
631    /// Compute kernel alignment
632    fn compute_kernel_alignment(&self, kernel_matrix: &Array2<f64>) -> f64 {
633        let n_samples = kernel_matrix.nrows();
634        let trace = (0..n_samples).map(|i| kernel_matrix[[i, i]]).sum::<f64>();
635        let frobenius_norm = kernel_matrix.iter().map(|&x| x * x).sum::<f64>().sqrt();
636
637        trace / frobenius_norm
638    }
639
640    /// Compute alignment derivative
641    fn compute_alignment_derivative(
642        &self,
643        kernel_matrix: &Array2<f64>,
644        kernel_derivative: &Array2<f64>,
645    ) -> f64 {
646        let n_samples = kernel_matrix.nrows();
647        let trace = (0..n_samples).map(|i| kernel_matrix[[i, i]]).sum::<f64>();
648        let trace_derivative = (0..n_samples)
649            .map(|i| kernel_derivative[[i, i]])
650            .sum::<f64>();
651
652        let frobenius_norm = kernel_matrix.iter().map(|&x| x * x).sum::<f64>().sqrt();
653        let frobenius_derivative = kernel_matrix
654            .iter()
655            .zip(kernel_derivative.iter())
656            .map(|(&k, &dk)| k * dk)
657            .sum::<f64>()
658            / frobenius_norm;
659
660        (trace_derivative * frobenius_norm - trace * frobenius_derivative)
661            / (frobenius_norm * frobenius_norm)
662    }
663
664    /// Split data for cross-validation
665    fn split_data(
666        &self,
667        x: &Array2<f64>,
668        y: &Array1<f64>,
669        start_idx: usize,
670        end_idx: usize,
671    ) -> (Array2<f64>, Array1<f64>, Array2<f64>, Array1<f64>) {
672        let n_samples = x.nrows();
673        let n_features = x.ncols();
674
675        let mut x_train = Array2::zeros((n_samples - (end_idx - start_idx), n_features));
676        let mut y_train = Array1::zeros(n_samples - (end_idx - start_idx));
677        let mut x_val = Array2::zeros((end_idx - start_idx, n_features));
678        let mut y_val = Array1::zeros(end_idx - start_idx);
679
680        let mut train_idx = 0;
681        let mut val_idx = 0;
682
683        for i in 0..n_samples {
684            if i >= start_idx && i < end_idx {
685                x_val.row_mut(val_idx).assign(&x.row(i));
686                y_val[val_idx] = y[i];
687                val_idx += 1;
688            } else {
689                x_train.row_mut(train_idx).assign(&x.row(i));
690                y_train[train_idx] = y[i];
691                train_idx += 1;
692            }
693        }
694
695        (x_train, y_train, x_val, y_val)
696    }
697
    /// Compute the gradient contribution of one CV fold.
    ///
    /// NOTE(review): placeholder — returns a zero gradient and zero error,
    /// so the `CrossValidationError` objective currently performs no
    /// parameter updates.
    fn compute_fold_gradient(
        &self,
        _x_train: &Array2<f64>,
        _y_train: &Array1<f64>,
        _x_val: &Array2<f64>,
        _y_val: &Array1<f64>,
    ) -> Result<GradientResult> {
        // Simplified fold gradient computation
        let gradient = Array1::zeros(self.parameters.len());
        let objective_value = 0.0;

        Ok(GradientResult {
            gradient,
            objective_value,
            hessian: None,
        })
    }

    /// Compute the log marginal likelihood of a Gaussian process.
    ///
    /// NOTE(review): placeholder — always returns 0.0.
    fn compute_log_marginal_likelihood(
        &self,
        _kernel_matrix: &Array2<f64>,
        _y: &Array1<f64>,
    ) -> Result<f64> {
        // Simplified log marginal likelihood
        Ok(0.0)
    }

    /// Derivative of the log marginal likelihood w.r.t. one kernel
    /// parameter.
    ///
    /// NOTE(review): placeholder — always returns 0.0, so the
    /// `MarginalLikelihood` objective currently performs no updates.
    fn compute_marginal_likelihood_derivative(
        &self,
        _kernel_matrix: &Array2<f64>,
        _y: &Array1<f64>,
        _kernel_derivative: &Array2<f64>,
    ) -> Result<f64> {
        // Simplified derivative computation
        Ok(0.0)
    }

    /// Kernel ridge regression loss on the regularized kernel matrix.
    ///
    /// NOTE(review): placeholder — always returns 0.0.
    fn compute_kernel_ridge_loss(
        &self,
        _kernel_matrix: &Array2<f64>,
        _y: &Array1<f64>,
    ) -> Result<f64> {
        // Simplified kernel ridge loss
        Ok(0.0)
    }

    /// Derivative of the kernel ridge loss w.r.t. one kernel parameter.
    ///
    /// NOTE(review): placeholder — always returns 0.0, so the
    /// `KernelRidgeLoss` objective currently performs no updates.
    fn compute_kernel_ridge_derivative(
        &self,
        _kernel_matrix: &Array2<f64>,
        _y: &Array1<f64>,
        _kernel_derivative: &Array2<f64>,
    ) -> Result<f64> {
        // Simplified derivative computation
        Ok(0.0)
    }

    /// Maximum mean discrepancy between two sample sets.
    ///
    /// NOTE(review): placeholder — always returns 0.0.
    fn compute_mmd(&self, _x1: &ArrayView2<f64>, _x2: &ArrayView2<f64>) -> Result<f64> {
        // Simplified MMD computation
        Ok(0.0)
    }

    /// Per-parameter derivatives of the MMD.
    ///
    /// NOTE(review): placeholder — returns all zeros, so the
    /// `MaximumMeanDiscrepancy` objective currently performs no updates.
    fn compute_mmd_derivatives(
        &self,
        _x1: &ArrayView2<f64>,
        _x2: &ArrayView2<f64>,
    ) -> Result<Array1<f64>> {
        // Simplified derivative computation
        Ok(Array1::zeros(self.parameters.len()))
    }
774
775    /// Compute target kernel matrix
776    fn compute_target_kernel(&self, y: &Array1<f64>) -> Array2<f64> {
777        let n_samples = y.len();
778        let mut target_kernel = Array2::zeros((n_samples, n_samples));
779
780        for i in 0..n_samples {
781            for j in 0..n_samples {
782                target_kernel[[i, j]] = y[i] * y[j];
783            }
784        }
785
786        target_kernel
787    }
788
789    /// Compute kernel target alignment
790    fn compute_kta(&self, kernel_matrix: &Array2<f64>, target_kernel: &Array2<f64>) -> f64 {
791        let numerator = kernel_matrix
792            .iter()
793            .zip(target_kernel.iter())
794            .map(|(&k, &t)| k * t)
795            .sum::<f64>();
796
797        let k_norm = kernel_matrix.iter().map(|&k| k * k).sum::<f64>().sqrt();
798        let t_norm = target_kernel.iter().map(|&t| t * t).sum::<f64>().sqrt();
799
800        numerator / (k_norm * t_norm)
801    }
802
    /// Derivative of the KTA score w.r.t. one kernel parameter.
    ///
    /// NOTE(review): placeholder — always returns 0.0, so the
    /// `KernelTargetAlignment` objective currently performs no updates.
    fn compute_kta_derivative(
        &self,
        _kernel_matrix: &Array2<f64>,
        _target_kernel: &Array2<f64>,
        _kernel_derivative: &Array2<f64>,
    ) -> f64 {
        // Simplified KTA derivative
        0.0
    }

    /// Get the current kernel parameters.
    pub fn get_parameters(&self) -> &Array1<f64> {
        &self.parameters
    }

    /// Get the recorded optimization trace as (objective_value, parameters)
    /// pairs, one entry per completed iteration.
    pub fn get_optimization_history(&self) -> &Vec<(f64, Array1<f64>)> {
        &self.optimization_history
    }
823}
824
/// Gradient-based multi-kernel learning.
///
/// Optimizes several base kernels independently and combines them with a
/// learned (currently uniform) weight vector.
pub struct GradientMultiKernelLearner {
    // One independent learner per base kernel.
    base_learners: Vec<GradientKernelLearner>,
    // Convex combination weights over the base kernels (sum to 1).
    combination_weights: Array1<f64>,
    // Shared optimization configuration.
    // NOTE(review): stored but not read by the current implementation.
    config: GradientConfig,
}
831
832impl GradientMultiKernelLearner {
833    /// Create a new gradient-based multi-kernel learner
834    pub fn new(n_kernels: usize, n_parameters_per_kernel: usize) -> Self {
835        let mut base_learners = Vec::new();
836        for _ in 0..n_kernels {
837            base_learners.push(GradientKernelLearner::new(n_parameters_per_kernel));
838        }
839
840        Self {
841            base_learners,
842            combination_weights: Array1::from_elem(n_kernels, 1.0 / n_kernels as f64),
843            config: GradientConfig::default(),
844        }
845    }
846
847    /// Optimize all kernels and combination weights
848    pub fn optimize(&mut self, x: &Array2<f64>, y: Option<&Array1<f64>>) -> Result<()> {
849        // Optimize individual kernels
850        for learner in &mut self.base_learners {
851            learner.optimize(x, y)?;
852        }
853
854        // Optimize combination weights
855        self.optimize_combination_weights(x, y)?;
856
857        Ok(())
858    }
859
    /// Optimize the kernel combination weights.
    ///
    /// NOTE(review): placeholder — currently resets the weights to uniform
    /// (1 / n_kernels) regardless of the data.
    fn optimize_combination_weights(
        &mut self,
        _x: &Array2<f64>,
        _y: Option<&Array1<f64>>,
    ) -> Result<()> {
        // Simplified combination weight optimization
        let n_kernels = self.base_learners.len();
        self.combination_weights = Array1::from_elem(n_kernels, 1.0 / n_kernels as f64);
        Ok(())
    }
871
    /// Get the optimized parameter vector of every base kernel, in order.
    pub fn get_all_parameters(&self) -> Vec<&Array1<f64>> {
        self.base_learners
            .iter()
            .map(|learner| learner.get_parameters())
            .collect()
    }

    /// Get the current kernel combination weights.
    pub fn get_combination_weights(&self) -> &Array1<f64> {
        &self.combination_weights
    }
884}
885
886#[allow(non_snake_case)]
887#[cfg(test)]
888mod tests {
889    use super::*;
890    use scirs2_core::ndarray::Array2;
891
    // Defaults should match the values documented on GradientConfig.
    #[test]
    fn test_gradient_config() {
        let config = GradientConfig::default();
        assert_eq!(config.learning_rate, 0.01);
        assert_eq!(config.max_iterations, 1000);
        assert!(config.tolerance > 0.0);
    }

    // End-to-end smoke test: Adam + kernel alignment on a tiny data set
    // should run and preserve the parameter dimensionality.
    #[test]
    fn test_gradient_kernel_learner() {
        let mut learner = GradientKernelLearner::new(2)
            .with_optimizer(GradientOptimizer::Adam)
            .with_objective(KernelObjective::KernelAlignment);

        let x = Array2::from_shape_vec((4, 2), vec![1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0])
            .expect("operation should succeed");

        learner.initialize_parameters(Array1::from_vec(vec![1.0, 0.5]));
        let optimized_params = learner
            .optimize(&x, None)
            .expect("operation should succeed");

        assert_eq!(optimized_params.len(), 2);
    }

    // Every first-order optimizer variant should complete without error on
    // a minimal problem. (LBFGS is exercised implicitly via SGD's path.)
    #[test]
    fn test_gradient_optimizers() {
        let optimizers = vec![
            GradientOptimizer::SGD,
            GradientOptimizer::Momentum,
            GradientOptimizer::Adam,
            GradientOptimizer::AdaGrad,
            GradientOptimizer::RMSprop,
        ];

        for optimizer in optimizers {
            let mut learner = GradientKernelLearner::new(1).with_optimizer(optimizer);

            let x = Array2::from_shape_vec((3, 2), vec![1.0, 2.0, 2.0, 3.0, 3.0, 4.0])
                .expect("operation should succeed");

            learner.initialize_parameters(Array1::from_vec(vec![1.0]));
            let result = learner.optimize(&x, None);
            assert!(result.is_ok());
        }
    }

    // Out-of-range initial parameters must be clipped into their bounds at
    // initialization and stay inside them throughout optimization.
    #[test]
    fn test_parameter_bounds() {
        let mut learner = GradientKernelLearner::new(2).with_bounds(
            Array2::from_shape_vec(
                (2, 2),
                vec![
                    0.1, 10.0, // Parameter 0: [0.1, 10.0]
                    0.0, 5.0, // Parameter 1: [0.0, 5.0]
                ],
            )
            .expect("operation should succeed"),
        );

        let x = Array2::from_shape_vec((4, 2), vec![1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0])
            .expect("operation should succeed");

        learner.initialize_parameters(Array1::from_vec(vec![100.0, -1.0]));
        let optimized_params = learner
            .optimize(&x, None)
            .expect("operation should succeed");

        assert!(optimized_params[0] >= 0.1 && optimized_params[0] <= 10.0);
        assert!(optimized_params[1] >= 0.0 && optimized_params[1] <= 5.0);
    }
963
964    #[test]
965    fn test_multi_kernel_learner() {
966        let mut multi_learner = GradientMultiKernelLearner::new(3, 2);
967
968        let x = Array2::from_shape_vec((4, 2), vec![1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0])
969            .expect("operation should succeed");
970
971        multi_learner
972            .optimize(&x, None)
973            .expect("operation should succeed");
974
975        let all_params = multi_learner.get_all_parameters();
976        assert_eq!(all_params.len(), 3);
977
978        let weights = multi_learner.get_combination_weights();
979        assert_eq!(weights.len(), 3);
980    }
981
982    #[test]
983    fn test_objective_functions() {
984        let objectives = vec![
985            KernelObjective::KernelAlignment,
986            KernelObjective::CrossValidationError,
987            KernelObjective::MarginalLikelihood,
988            KernelObjective::KernelRidgeLoss,
989            KernelObjective::MaximumMeanDiscrepancy,
990            KernelObjective::KernelTargetAlignment,
991        ];
992
993        for objective in objectives {
994            let mut learner = GradientKernelLearner::new(1).with_objective(objective.clone());
995
996            let x = Array2::from_shape_vec((4, 2), vec![1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0])
997                .expect("operation should succeed");
998
999            let y = Array1::from_vec(vec![1.0, 0.0, 1.0, 0.0]);
1000
1001            learner.initialize_parameters(Array1::from_vec(vec![1.0]));
1002
1003            let result = if objective == KernelObjective::KernelAlignment
1004                || objective == KernelObjective::MaximumMeanDiscrepancy
1005            {
1006                learner.optimize(&x, None)
1007            } else {
1008                learner.optimize(&x, Some(&y))
1009            };
1010
1011            assert!(result.is_ok());
1012        }
1013    }
1014
1015    #[test]
1016    fn test_adaptive_learning_rate() {
1017        let config = GradientConfig {
1018            adaptive_learning_rate: true,
1019            learning_rate_decay: 0.5,
1020            min_learning_rate: 1e-6,
1021            ..Default::default()
1022        };
1023
1024        let mut learner = GradientKernelLearner::new(1).with_config(config);
1025
1026        let x = Array2::from_shape_vec((4, 2), vec![1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0])
1027            .expect("operation should succeed");
1028
1029        learner.initialize_parameters(Array1::from_vec(vec![1.0]));
1030        let result = learner.optimize(&x, None);
1031        assert!(result.is_ok());
1032    }
1033}