sklears_gaussian_process/
kernel_optimization.rs

//! Kernel parameter optimization
//!
//! This module provides functionality for optimizing kernel hyperparameters by
//! minimizing the negative log marginal likelihood, using gradient-based methods
//! (an Adam update with optional backtracking line search) and random restarts.
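//!
//! A minimal usage sketch (doctest marked `ignore`; the data, the `RBF` kernel, and
//! the builder calls mirror the tests at the bottom of this file):
//!
//! ```ignore
//! use scirs2_core::ndarray::{Array1, Array2};
//!
//! // Toy 1-D training data.
//! let X = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
//! let y = Array1::from_vec(vec![1.0, 4.0, 9.0, 16.0, 25.0]);
//!
//! // Start from an RBF kernel with unit length scale and tune it in place.
//! let mut kernel: Box<dyn Kernel> = Box::new(RBF::new(1.0));
//! let result = KernelOptimizer::new()
//!     .max_iter(50)
//!     .n_restarts(2)
//!     .optimize_kernel(&mut kernel, X.view(), y.view())
//!     .unwrap();
//! assert!(result.final_objective.is_finite());
//! ```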

use crate::kernels::Kernel;
use crate::marginal_likelihood::log_marginal_likelihood;
// SciRS2 Policy - Use scirs2-autograd for ndarray types and operations
use scirs2_core::ndarray::{ArrayView1, ArrayView2};
use sklears_core::error::{Result as SklResult, SklearsError};

/// Kernel parameter optimizer
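///
/// A configuration sketch using the builder methods below (the values are
/// illustrative, not recommendations):
///
/// ```ignore
/// let optimizer = KernelOptimizer::new()
///     .max_iter(200)
///     .learning_rate(0.05)
///     .tol(1e-8)
///     .n_restarts(3)
///     .bounds(vec![(1e-3, 1e3)]); // one (lower, upper) pair per kernel parameter
/// ```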
#[derive(Debug, Clone)]
pub struct KernelOptimizer {
    /// Maximum number of optimization iterations
    pub max_iter: usize,
    /// Learning rate for gradient descent
    pub learning_rate: f64,
    /// Tolerance for convergence
    pub tol: f64,
    /// Whether to use line search
    pub use_line_search: bool,
    /// Number of random restarts
    pub n_restarts: usize,
    /// Bounds for parameters (optional)
    pub bounds: Option<Vec<(f64, f64)>>,
    /// Random state for reproducible results
    pub random_state: Option<u64>,
}

impl Default for KernelOptimizer {
    fn default() -> Self {
        Self {
            max_iter: 100,
            learning_rate: 0.01,
            tol: 1e-6,
            use_line_search: true,
            n_restarts: 5,
            bounds: None,
            random_state: Some(42),
        }
    }
}

/// Result of kernel parameter optimization
#[derive(Debug, Clone)]
pub struct OptimizationResult {
    /// Optimized kernel parameters
    pub optimized_params: Vec<f64>,
    /// Final objective value (negative log marginal likelihood)
    pub final_objective: f64,
    /// Number of iterations used
    pub n_iterations: usize,
    /// Whether optimization converged
    pub converged: bool,
    /// Optimization history (objective values)
    pub history: Vec<f64>,
    /// Final gradient norm
    pub final_gradient_norm: f64,
}

impl KernelOptimizer {
    /// Create a new kernel optimizer
    pub fn new() -> Self {
        Self::default()
    }

    /// Set maximum number of iterations
    pub fn max_iter(mut self, max_iter: usize) -> Self {
        self.max_iter = max_iter;
        self
    }

    /// Set learning rate
    pub fn learning_rate(mut self, learning_rate: f64) -> Self {
        self.learning_rate = learning_rate;
        self
    }

    /// Set convergence tolerance
    pub fn tol(mut self, tol: f64) -> Self {
        self.tol = tol;
        self
    }

    /// Set whether to use line search
    pub fn use_line_search(mut self, use_line_search: bool) -> Self {
        self.use_line_search = use_line_search;
        self
    }

    /// Set number of random restarts
    pub fn n_restarts(mut self, n_restarts: usize) -> Self {
        self.n_restarts = n_restarts;
        self
    }

    /// Set parameter bounds
    pub fn bounds(mut self, bounds: Vec<(f64, f64)>) -> Self {
        self.bounds = Some(bounds);
        self
    }

    /// Set random state
    pub fn random_state(mut self, random_state: Option<u64>) -> Self {
        self.random_state = random_state;
        self
    }

    /// Optimize the kernel's hyperparameters on training data `(X, y)`.
    ///
    /// Runs `n_restarts` optimizations (the first from the kernel's current
    /// parameters, the rest from random perturbations), keeps the best result,
    /// and leaves the kernel set to the best parameters found.
    pub fn optimize_kernel(
        &self,
        kernel: &mut Box<dyn Kernel>,
        X: ArrayView2<f64>,
        y: ArrayView1<f64>,
    ) -> SklResult<OptimizationResult> {
        let mut best_result = None;
        let mut best_objective = f64::INFINITY;

        // Try multiple random restarts
        for restart in 0..self.n_restarts {
            let seed = self.random_state.map(|s| s.wrapping_add(restart as u64));

            // Initialize parameters (randomly for restarts > 0)
            let initial_params = if restart == 0 {
                kernel.get_params()
            } else {
                self.random_initialize_params(kernel, seed)?
            };

            // Optimize from this starting point
            let result = self.optimize_from_initial(kernel, &X, &y, initial_params)?;

            // Keep track of best result
            if result.final_objective < best_objective {
                best_objective = result.final_objective;
                best_result = Some(result);
            }
        }

        let final_result = best_result.ok_or_else(|| {
            SklearsError::InvalidOperation("No optimization result found".to_string())
        })?;

        // Set the kernel to the best parameters found
        kernel.set_params(&final_result.optimized_params)?;

        Ok(final_result)
    }

    /// Optimize from initial parameters
    fn optimize_from_initial(
        &self,
        kernel: &mut Box<dyn Kernel>,
        X: &ArrayView2<f64>,
        y: &ArrayView1<f64>,
        initial_params: Vec<f64>,
    ) -> SklResult<OptimizationResult> {
        let mut params = initial_params;
        let mut history = Vec::new();
        let mut converged = false;

        // Use Adam optimizer for better convergence
        let mut m = vec![0.0; params.len()]; // First moment
        let mut v = vec![0.0; params.len()]; // Second moment
        let beta1 = 0.9;
        let beta2 = 0.999;
        let epsilon = 1e-8;

        for iteration in 0..self.max_iter {
            // Set current parameters
            kernel.set_params(&params)?;

            // Compute objective (negative log marginal likelihood) and gradient
            let (objective, gradient) = self.compute_objective_and_gradient(kernel, X, y)?;
            history.push(objective);

            // Check convergence
            let gradient_norm = gradient.iter().map(|x| x * x).sum::<f64>().sqrt();
            if gradient_norm < self.tol {
                converged = true;
                break;
            }

            // Apply bounds if specified
            let clipped_gradient = self.apply_bounds_to_gradient(&params, &gradient);

            // Adam optimizer update
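            // m and v are exponentially decaying averages of the gradient and its
            // square; m_hat / v_hat divide out the (1 - beta^t) bias both averages
            // carry in early iterations, with t = iteration + 1.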
            for i in 0..params.len() {
                m[i] = beta1 * m[i] + (1.0 - beta1) * clipped_gradient[i];
                v[i] = beta2 * v[i] + (1.0 - beta2) * clipped_gradient[i] * clipped_gradient[i];

                let m_hat = m[i] / (1.0 - beta1.powi(iteration as i32 + 1));
                let v_hat = v[i] / (1.0 - beta2.powi(iteration as i32 + 1));

                let update = self.learning_rate * m_hat / (v_hat.sqrt() + epsilon);
                params[i] -= update;
            }

            // Apply bounds to parameters
            self.apply_bounds_to_params(&mut params);

            // Line search if enabled
            if self.use_line_search && iteration % 10 == 0 {
                params = self.line_search(kernel, X, y, &params, &clipped_gradient)?;
            }
        }

        // Evaluate the gradient at the final parameter values; at this point the
        // kernel still holds the values set at the start of the last iteration.
        kernel.set_params(&params)?;
        let final_gradient = self.compute_gradient(kernel, X, y)?;
        let final_gradient_norm = final_gradient.iter().map(|x| x * x).sum::<f64>().sqrt();

        Ok(OptimizationResult {
            optimized_params: params,
            final_objective: history.last().copied().unwrap_or(f64::INFINITY),
            n_iterations: history.len(),
            converged,
            history,
            final_gradient_norm,
        })
    }

    /// Compute the objective (negative log marginal likelihood) and its gradient
    #[allow(non_snake_case)]
    fn compute_objective_and_gradient(
        &self,
        kernel: &Box<dyn Kernel>,
        X: &ArrayView2<f64>,
        y: &ArrayView1<f64>,
    ) -> SklResult<(f64, Vec<f64>)> {
        // Compute negative log marginal likelihood
        let X_owned = X.to_owned();
        let y_owned = y.to_owned();
        let neg_log_ml = -log_marginal_likelihood(&X_owned.view(), &y_owned.view(), kernel, 1e-6)?;

        // Compute gradient using finite differences
        let gradient = self.compute_gradient(kernel, X, y)?;

        Ok((neg_log_ml, gradient))
    }

    /// Compute gradient using finite differences
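    ///
    /// Each component is approximated with a central difference of the negative
    /// log marginal likelihood: g[i] ≈ (f(p + h*e_i) - f(p - h*e_i)) / (2h).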
    #[allow(non_snake_case)]
    fn compute_gradient(
        &self,
        kernel: &Box<dyn Kernel>,
        X: &ArrayView2<f64>,
        y: &ArrayView1<f64>,
    ) -> SklResult<Vec<f64>> {
        let params = kernel.get_params();
        let mut gradient = vec![0.0; params.len()];
        let h = 1e-6; // Step size for finite differences

        let X_owned = X.to_owned();
        let y_owned = y.to_owned();

        // Central difference for each parameter
        for i in 0..params.len() {
            let mut params_plus = params.clone();
            let mut params_minus = params.clone();

            params_plus[i] += h;
            params_minus[i] -= h;

            // Create temporary kernels for evaluation
            let mut kernel_plus = kernel.clone_box();
            let mut kernel_minus = kernel.clone_box();

            kernel_plus.set_params(&params_plus)?;
            kernel_minus.set_params(&params_minus)?;

            let f_plus =
                -log_marginal_likelihood(&X_owned.view(), &y_owned.view(), &kernel_plus, 1e-6)?;
            let f_minus =
                -log_marginal_likelihood(&X_owned.view(), &y_owned.view(), &kernel_minus, 1e-6)?;

            gradient[i] = (f_plus - f_minus) / (2.0 * h);
        }

        Ok(gradient)
    }

    /// Apply bounds to the gradient: zero any component whose descent step
    /// (`param -= learning_rate * gradient`) would push a parameter past an
    /// active bound
    fn apply_bounds_to_gradient(&self, params: &Vec<f64>, gradient: &Vec<f64>) -> Vec<f64> {
        if let Some(bounds) = &self.bounds {
            let mut clipped = gradient.clone();
            for i in 0..params.len() {
                if i < bounds.len() {
                    let (lower, upper) = bounds[i];
                    // At the lower bound a positive gradient would step the
                    // parameter further below the bound, so zero it
                    if params[i] <= lower && gradient[i] > 0.0 {
                        clipped[i] = 0.0;
                    }
                    // At the upper bound a negative gradient would step the
                    // parameter further above the bound, so zero it
                    if params[i] >= upper && gradient[i] < 0.0 {
                        clipped[i] = 0.0;
                    }
                }
            }
            clipped
        } else {
            gradient.clone()
        }
    }

    /// Apply bounds to parameters
    fn apply_bounds_to_params(&self, params: &mut Vec<f64>) {
        if let Some(bounds) = &self.bounds {
            for i in 0..params.len() {
                if i < bounds.len() {
                    let (lower, upper) = bounds[i];
                    params[i] = params[i].max(lower).min(upper);
                }
            }
        }
    }

    /// Simple backtracking line search with an Armijo sufficient-decrease test
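    ///
    /// The trial step is `params - alpha * direction`; a candidate is accepted when
    /// f(p - alpha*d) <= f(p) - c1 * alpha * (grad(f) . d), and alpha is shrunk by
    /// `rho` otherwise.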
    #[allow(non_snake_case)]
    fn line_search(
        &self,
        kernel: &mut Box<dyn Kernel>,
        X: &ArrayView2<f64>,
        y: &ArrayView1<f64>,
        params: &Vec<f64>,
        direction: &Vec<f64>,
    ) -> SklResult<Vec<f64>> {
        let mut alpha = 1.0;
        let rho = 0.5;
        let c1 = 1e-4;

        // Current objective
        kernel.set_params(params)?;
        let X_owned = X.to_owned();
        let y_owned = y.to_owned();
        let f0 = -log_marginal_likelihood(&X_owned.view(), &y_owned.view(), kernel, 1e-6)?;

        // Directional derivative
        let grad_f = self.compute_gradient(kernel, X, y)?;
        let dir_deriv: f64 = grad_f
            .iter()
            .zip(direction.iter())
            .map(|(g, d)| g * d)
            .sum();

        // Maximum of 20 line search iterations
        for _ in 0..20 {
            let new_params: Vec<f64> = params
                .iter()
                .zip(direction.iter())
                .map(|(p, d)| p - alpha * d)
                .collect();

            // Apply bounds
            let mut bounded_params = new_params;
            self.apply_bounds_to_params(&mut bounded_params);

            kernel.set_params(&bounded_params)?;
            let f_new = -log_marginal_likelihood(&X_owned.view(), &y_owned.view(), kernel, 1e-6)?;

            // Armijo sufficient-decrease condition for the step `params - alpha * direction`
            if f_new <= f0 - c1 * alpha * dir_deriv {
                return Ok(bounded_params);
            }

            alpha *= rho;
        }

        // If line search fails, return original parameters
        Ok(params.clone())
    }

    /// Random initialization of parameters for restarts
    fn random_initialize_params(
        &self,
        kernel: &Box<dyn Kernel>,
        seed: Option<u64>,
    ) -> SklResult<Vec<f64>> {
        let mut params = kernel.get_params();

        // Simple random perturbation (in practice, could use better strategies)
        let scale = 0.5; // Perturbation scale
        for i in 0..params.len() {
            let noise = if let Some(s) = seed {
                // Simple linear congruential generator for reproducibility
                let a = 1664525u64;
                let c = 1013904223u64;
                let m = 2u64.pow(32);
                let x = (a.wrapping_mul(s.wrapping_add(i as u64)).wrapping_add(c)) % m;
                (x as f64 / m as f64 - 0.5) * 2.0 // Range [-1, 1)
            } else {
                0.0 // No randomization if no seed
            };

            params[i] *= 1.0 + scale * noise;

            // Ensure parameters stay positive (common requirement for GP kernels)
            params[i] = params[i].abs().max(1e-6);
        }

        // Apply bounds if specified
        self.apply_bounds_to_params(&mut params);

        Ok(params)
    }
}

/// Convenience function that optimizes kernel parameters with a `KernelOptimizer`
/// (Adam-based gradient optimization with optional line search and random restarts).
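///
/// A minimal sketch of this one-shot helper (doctest marked `ignore`; `X` and `y`
/// are any training arrays, and passing `None` falls back to
/// `KernelOptimizer::default()`):
///
/// ```ignore
/// let mut kernel: Box<dyn Kernel> = Box::new(RBF::new(1.0));
/// let result = optimize_kernel_parameters(&mut kernel, X.view(), y.view(), None).unwrap();
/// if result.converged {
///     println!("converged after {} iterations", result.n_iterations);
/// }
/// ```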
pub fn optimize_kernel_parameters(
    kernel: &mut Box<dyn Kernel>,
    X: ArrayView2<f64>,
    y: ArrayView1<f64>,
    optimizer_config: Option<KernelOptimizer>,
) -> SklResult<OptimizationResult> {
    let optimizer = optimizer_config.unwrap_or_default();
    optimizer.optimize_kernel(kernel, X, y)
}

#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use crate::kernels::{Kernel, RBF};
    // SciRS2 Policy - Use scirs2-autograd for ndarray types and operations
    use scirs2_core::ndarray::{Array1, Array2};

    #[test]
    fn test_kernel_optimizer_creation() {
        let optimizer = KernelOptimizer::new();
        assert_eq!(optimizer.max_iter, 100);
        assert_eq!(optimizer.n_restarts, 5);
        assert!(optimizer.use_line_search);
    }

    #[test]
    #[allow(non_snake_case)]
    fn test_kernel_optimization() {
        let optimizer = KernelOptimizer::new()
            .max_iter(10)
            .n_restarts(1)
            .learning_rate(0.1);

        // Create simple test data
        let X = Array2::from_shape_vec((5, 1), vec![1.0, 2.0, 3.0, 4.0, 5.0]).unwrap();
        let y = Array1::from_vec(vec![1.0, 4.0, 9.0, 16.0, 25.0]);

        let mut kernel: Box<dyn Kernel> = Box::new(RBF::new(1.0));

        let result = optimizer
            .optimize_kernel(&mut kernel, X.view(), y.view())
            .unwrap();

        assert!(result.final_objective.is_finite());
        assert_eq!(result.n_iterations, 10); // Should run full iterations for this test
        assert!(!result.history.is_empty());
    }

    #[test]
    fn test_bounds_application() {
        let bounds = vec![(0.1, 10.0)];
        let optimizer = KernelOptimizer::new().bounds(bounds);

        let mut params = vec![-1.0]; // Below lower bound
        optimizer.apply_bounds_to_params(&mut params);
        assert_eq!(params[0], 0.1);

        params[0] = 15.0; // Above upper bound
        optimizer.apply_bounds_to_params(&mut params);
        assert_eq!(params[0], 10.0);
    }

    #[test]
    #[allow(non_snake_case)]
    fn test_gradient_computation() {
        let optimizer = KernelOptimizer::new();
        let kernel: Box<dyn Kernel> = Box::new(RBF::new(1.0));

        let X = Array2::from_shape_vec((3, 1), vec![1.0, 2.0, 3.0]).unwrap();
        let y = Array1::from_vec(vec![1.0, 2.0, 3.0]);

        let gradient = optimizer
            .compute_gradient(&kernel, &X.view(), &y.view())
            .unwrap();

        assert_eq!(gradient.len(), 1); // RBF has one parameter
        assert!(gradient[0].is_finite());
    }

    #[test]
    fn test_random_initialization() {
        let optimizer = KernelOptimizer::new();
        let kernel: Box<dyn Kernel> = Box::new(RBF::new(1.0));

        let params1 = optimizer
            .random_initialize_params(&kernel, Some(42))
            .unwrap();
        let params2 = optimizer
            .random_initialize_params(&kernel, Some(42))
            .unwrap();
        let params3 = optimizer
            .random_initialize_params(&kernel, Some(43))
            .unwrap();

        // Same seed should give same result
        assert_eq!(params1[0], params2[0]);
        // Different seed should give different result
        assert_ne!(params1[0], params3[0]);
        // All parameters should be positive
        assert!(params1[0] > 0.0);
        assert!(params3[0] > 0.0);
    }
}