//! Gaussian process
//!
//! ```rust
//! # use friedrich::gaussian_process::GaussianProcess;
//! # use chrono::Duration;
//! // Trains a gaussian process on a dataset of one-dimensional vectors.
//! let training_inputs = vec![vec![0.8], vec![1.2], vec![3.8], vec![4.2]];
//! let training_outputs = vec![3.0, 4.0, -2.0, -2.0];
//! let mut gp = GaussianProcess::default(training_inputs, training_outputs);
//!
//! // Predicts the mean and variance of a single point.
//! let input = vec![1.];
//! let mean = gp.predict(&input);
//! let var = gp.predict_variance(&input);
//! println!("prediction: {} ± {}", mean, var.sqrt());
//!
//! // Makes several predictions.
//! let inputs = vec![vec![1.0], vec![2.0], vec![3.0]];
//! let outputs = gp.predict(&inputs);
//! println!("predictions: {:?}", outputs);
//!
//! // Updates the model with additional samples and refits its parameters.
//! let additional_inputs = vec![vec![0.], vec![1.], vec![2.], vec![5.]];
//! let additional_outputs = vec![2.0, 3.0, -1.0, -2.0];
//! let fit_prior = true;
//! let fit_kernel = true;
//! let max_iter = 100;
//! let convergence_fraction = 0.05;
//! let max_time = Duration::seconds(3600);
//! gp.add_samples(&additional_inputs, &additional_outputs);
//! gp.fit_parameters(fit_prior, fit_kernel, max_iter, convergence_fraction, max_time);
//!
//! // Samples from the distribution.
//! let new_inputs = vec![vec![1.0], vec![2.0]];
//! let sampler = gp.sample_at(&new_inputs);
//! let mut rng = rand::rng();
//! for i in 1..=5 {
//!   println!("sample {} : {:?}", i, sampler.sample(&mut rng));
//! }
//! ```

use crate::algebra::{add_rows_cholesky_cov_matrix, make_cholesky_cov_matrix, make_covariance_matrix, EMatrix,
                 EVector};
use crate::conversion::Input;
use crate::parameters::{kernel, kernel::Kernel, prior, prior::Prior};
use chrono::Duration;
use nalgebra::{Cholesky, DMatrix, DVector, Dyn};

mod multivariate_normal;
pub use multivariate_normal::MultivariateNormal;

mod builder;
pub use builder::GaussianProcessBuilder;

mod optimizer;

/// A Gaussian process that can be used to make predictions based on its training data
#[cfg_attr(feature = "friedrich_serde", derive(serde::Deserialize, serde::Serialize))]
pub struct GaussianProcess<KernelType: Kernel, PriorType: Prior>
{
    /// Value to which the process will regress in the absence of information.
    pub prior: PriorType,
    /// Kernel used to fit the process on the data.
    pub kernel: KernelType,
    /// Amplitude of the noise of the data as provided by the user or deduced by the optimizer.
    pub noise: f64,
    /// During Cholesky decomposition, this epsilon is used in place of the
    /// diagonal term if and only if the decomposition would otherwise fail.
    /// This is especially useful for noiseless sampling processes where very
    /// small covariance can result in numerical errors causing Cholesky to
    /// fail. See <https://github.com/nestordemeure/friedrich/issues/43> for
    /// details.
    pub cholesky_epsilon: Option<f64>,
    /// Data used to fit the model.
    training_inputs: EMatrix,
    training_outputs: EVector,
    /// Cholesky decomposition of the covariance matrix trained on the current data points.
    covmat_cholesky: Cholesky<f64, Dyn>
}

impl GaussianProcess<kernel::Gaussian, prior::ConstantPrior>
{
    /// Returns a gaussian process with a Gaussian kernel and a constant prior, both fitted to the data.
    ///
    /// ```rust
    /// # use friedrich::gaussian_process::GaussianProcess;
    /// # fn main() {
    /// // training data
    /// let training_inputs = vec![vec![0.8], vec![1.2], vec![3.8], vec![4.2]];
    /// let training_outputs = vec![3.0, 4.0, -2.0, -2.0];
    ///
    /// // defining and training a model
    /// let gp = GaussianProcess::default(training_inputs, training_outputs);
    /// # }
    /// ```
    pub fn default<T: Input>(training_inputs: T, training_outputs: T::InVector) -> Self
    {
        GaussianProcessBuilder::<kernel::Gaussian, prior::ConstantPrior>::new(training_inputs,
                                                                              training_outputs).fit_kernel()
                                                                                               .fit_prior()
                                                                                               .train()
    }

    /// Returns a builder to define specific parameters of the gaussian process.
    ///
    /// ```rust
    /// # use friedrich::gaussian_process::GaussianProcess;
    /// # use friedrich::prior::*;
    /// # use friedrich::kernel::*;
    /// # fn main() {
    /// # // training data
    /// # let training_inputs = vec![vec![0.8], vec![1.2], vec![3.8], vec![4.2]];
    /// # let training_outputs = vec![3.0, 4.0, -2.0, -2.0];
    /// // model parameters
    /// let input_dimension = 1;
    /// let output_noise = 0.1;
    /// let exponential_kernel = Exponential::default();
    /// let linear_prior = LinearPrior::default(input_dimension);
    ///
    /// // defining and training a model
    /// let gp = GaussianProcess::builder(training_inputs, training_outputs).set_noise(output_noise)
    ///                                                                     .set_kernel(exponential_kernel)
    ///                                                                     .fit_kernel()
    ///                                                                     .set_prior(linear_prior)
    ///                                                                     .fit_prior()
    ///                                                                     .train();
    /// # }
    /// ```
    pub fn builder<T: Input>(training_inputs: T,
                             training_outputs: T::InVector)
                             -> GaussianProcessBuilder<kernel::Gaussian, prior::ConstantPrior>
    {
        GaussianProcessBuilder::<kernel::Gaussian, prior::ConstantPrior>::new(training_inputs,
                                                                              training_outputs)
    }
}

impl<KernelType: Kernel, PriorType: Prior> GaussianProcess<KernelType, PriorType>
{
    /// Low-level constructor that builds a gaussian process from the given parameters and data.
    /// We recommend that you use either the default parameters or the builder instead, as they simplify the definition process.
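    ///
    /// A minimal sketch, reusing the kernel and prior types from the builder example above
    /// (the parameter values here are illustrative, not recommendations):
    ///
    /// ```rust
    /// # use friedrich::gaussian_process::GaussianProcess;
    /// # use friedrich::kernel::Exponential;
    /// # use friedrich::prior::LinearPrior;
    /// # fn main() {
    /// # let training_inputs = vec![vec![0.8], vec![1.2], vec![3.8], vec![4.2]];
    /// # let training_outputs = vec![3.0, 4.0, -2.0, -2.0];
    /// let prior = LinearPrior::default(1); // one-dimensional inputs
    /// let kernel = Exponential::default();
    /// let noise = 0.1;
    /// let cholesky_epsilon = None; // no jitter unless explicitly requested
    /// let gp = GaussianProcess::new(prior, kernel, noise, cholesky_epsilon, training_inputs, training_outputs);
    /// # }
    /// ```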
    pub fn new<T: Input>(prior: PriorType,
                         kernel: KernelType,
                         noise: f64,
                         cholesky_epsilon: Option<f64>,
                         training_inputs: T,
                         training_outputs: T::InVector)
                         -> Self
    {
        assert!(noise >= 0., "The noise parameter should be non-negative but we tried to set it to {}", noise);
        let training_inputs = T::into_dmatrix(training_inputs);
        let training_outputs = T::into_dvector(training_outputs);
        assert_eq!(training_inputs.nrows(), training_outputs.nrows());
        // converts training data into extendable matrix
        let training_inputs = EMatrix::new(training_inputs);
        let training_outputs = EVector::new(training_outputs - prior.prior(&training_inputs.as_matrix()));
        // computes cholesky decomposition
        let covmat_cholesky =
            make_cholesky_cov_matrix(&training_inputs.as_matrix(), &kernel, noise, cholesky_epsilon);
        GaussianProcess { prior,
                          kernel,
                          noise,
                          cholesky_epsilon,
                          training_inputs,
                          training_outputs,
                          covmat_cholesky }
    }

    /// Adds new samples to the model.
    ///
    /// Updates the model (which is faster than retraining from scratch)
    /// but does not refit the parameters.
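    ///
    /// A minimal sketch, mirroring the crate-level example:
    ///
    /// ```rust
    /// # use friedrich::gaussian_process::GaussianProcess;
    /// # fn main() {
    /// # let training_inputs = vec![vec![0.8], vec![1.2], vec![3.8], vec![4.2]];
    /// # let training_outputs = vec![3.0, 4.0, -2.0, -2.0];
    /// let mut gp = GaussianProcess::default(training_inputs, training_outputs);
    ///
    /// // new samples must have the same input dimension as the training data
    /// let additional_inputs = vec![vec![0.], vec![1.], vec![2.], vec![5.]];
    /// let additional_outputs = vec![2.0, 3.0, -1.0, -2.0];
    /// gp.add_samples(&additional_inputs, &additional_outputs);
    /// # }
    /// ```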
    pub fn add_samples<T: Input>(&mut self, inputs: &T, outputs: &T::InVector)
    {
        let inputs = T::to_dmatrix(inputs);
        let outputs = T::to_dvector(outputs);
        assert_eq!(inputs.nrows(), outputs.nrows());
        assert_eq!(inputs.ncols(), self.training_inputs.as_matrix().ncols());
        // grows the training matrix
        let outputs = outputs - self.prior.prior(&inputs);
        self.training_inputs.add_rows(&inputs);
        self.training_outputs.add_rows(&outputs);
        // add new rows to cholesky matrix
        let nb_new_inputs = inputs.nrows();
        add_rows_cholesky_cov_matrix(&mut self.covmat_cholesky,
                                     &self.training_inputs.as_matrix(),
                                     nb_new_inputs,
                                     &self.kernel,
                                     self.noise);
    }

    /// Computes the log marginal likelihood of the current model given the training data.
    ///
    /// This quantity can be used for model selection.
    /// Given two models, the one with the highest score would be the one with the highest probability of producing the data.
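    ///
    /// For example, one could compare two kernels and keep the model with the higher score
    /// (a sketch reusing the types from this crate's other examples):
    ///
    /// ```rust
    /// # use friedrich::gaussian_process::GaussianProcess;
    /// # use friedrich::kernel::Exponential;
    /// # fn main() {
    /// # let training_inputs = vec![vec![0.8], vec![1.2], vec![3.8], vec![4.2]];
    /// # let training_outputs = vec![3.0, 4.0, -2.0, -2.0];
    /// let gp_gaussian = GaussianProcess::default(training_inputs.clone(), training_outputs.clone());
    /// let gp_exponential = GaussianProcess::builder(training_inputs, training_outputs)
    ///                                      .set_kernel(Exponential::default())
    ///                                      .train();
    /// if gp_gaussian.likelihood() > gp_exponential.likelihood() {
    ///     println!("the gaussian kernel explains the data better");
    /// }
    /// # }
    /// ```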
    pub fn likelihood(&self) -> f64
    {
        // formula : -1/2 (transpose(output)*cov(train,train)^-1*output + log(det(cov(train,train))) + size(train)*log(2*pi))

        // How well do we fit the training data?
        let output = self.training_outputs.as_vector();
        // transpose(ol)*ol = transpose(output)*cov(train,train)^-1*output
        let ol = self.covmat_cholesky.l().solve_lower_triangular(&output).expect("likelihood : solve failed");
        let data_fit: f64 = ol.norm_squared();

        // penalizes complex models
        // NOTE: this approximates the log-determinant term using the diagonal of the covariance matrix,
        // which is easier to recompute from the kernels than extracting and squaring the diagonal of the cholesky matrix
        let complexity_penalty: f64 = self.training_inputs
                                          .as_matrix()
                                          .row_iter()
                                          .map(|r| self.kernel.kernel(&r, &r) + self.noise * self.noise)
                                          .map(|c| c.abs().ln())
                                          .sum();

        // normalization constant of the multivariate gaussian
        let n = self.training_inputs.as_matrix().nrows();
        let normalization_constant = (n as f64) * (2. * std::f64::consts::PI).ln();

        -(data_fit + complexity_penalty + normalization_constant) / 2.
    }

    //----------------------------------------------------------------------------------------------
    // PREDICT

    /// Makes a prediction (the mean of the gaussian process) for each row of the input.
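    ///
    /// A minimal sketch, mirroring the crate-level example:
    ///
    /// ```rust
    /// # use friedrich::gaussian_process::GaussianProcess;
    /// # fn main() {
    /// # let training_inputs = vec![vec![0.8], vec![1.2], vec![3.8], vec![4.2]];
    /// # let training_outputs = vec![3.0, 4.0, -2.0, -2.0];
    /// let gp = GaussianProcess::default(training_inputs, training_outputs);
    ///
    /// // one prediction per row of the input
    /// let inputs = vec![vec![1.0], vec![2.0], vec![3.0]];
    /// let means = gp.predict(&inputs);
    /// println!("predictions: {:?}", means);
    /// # }
    /// ```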
    pub fn predict<T: Input>(&self, inputs: &T) -> T::OutVector
    {
        // formula : prior + cov(input,train)*cov(train,train)^-1 * output

        let inputs = T::to_dmatrix(inputs);
        assert_eq!(inputs.ncols(), self.training_inputs.as_matrix().ncols());

        // computes weights to give each training sample
        let mut weights = make_covariance_matrix(&self.training_inputs.as_matrix(), &inputs, &self.kernel);
        self.covmat_cholesky.solve_mut(&mut weights);

        // computes prior for the given inputs
        let mut prior = self.prior.prior(&inputs);

        // weights.transpose() * &self.training_outputs + prior
        prior.gemm_tr(1f64, &weights, &self.training_outputs.as_vector(), 1f64);

        T::from_dvector(&prior)
    }

    /// Predicts the variance of the gaussian process for each row of the input.
    /// This quantity (and its square root) can be used as a proxy for the uncertainty of the prediction.
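    ///
    /// A minimal sketch, mirroring the crate-level example:
    ///
    /// ```rust
    /// # use friedrich::gaussian_process::GaussianProcess;
    /// # fn main() {
    /// # let training_inputs = vec![vec![0.8], vec![1.2], vec![3.8], vec![4.2]];
    /// # let training_outputs = vec![3.0, 4.0, -2.0, -2.0];
    /// let gp = GaussianProcess::default(training_inputs, training_outputs);
    ///
    /// // one variance per row of the input
    /// let inputs = vec![vec![1.0], vec![2.0]];
    /// let variances = gp.predict_variance(&inputs);
    /// println!("variances: {:?}", variances);
    /// # }
    /// ```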
    pub fn predict_variance<T: Input>(&self, inputs: &T) -> T::OutVector
    {
        // formula, diagonal of : cov(input,input) - cov(input,train)*cov(train,train)^-1*cov(train,input)

        let inputs = T::to_dmatrix(inputs);
        assert_eq!(inputs.ncols(), self.training_inputs.as_matrix().ncols());

        // compute the covariances
        let cov_train_inputs =
            make_covariance_matrix(&self.training_inputs.as_matrix(), &inputs, &self.kernel);

        // solve linear system
        let kl = self.covmat_cholesky
                     .l()
                     .solve_lower_triangular(&cov_train_inputs)
                     .expect("predict_covariance : solve failed");

        // (cov_inputs_inputs - (kl.transpose() * kl)).diagonal()
        let variances = inputs.row_iter()
                              .map(|row| self.kernel.kernel(&row, &row)) // variance of input points with themselves
                              .zip(kl.column_iter().map(|col| col.norm_squared())) // diag(kl^T * kl)
                              .map(|(base_cov, predicted_cov)| base_cov - predicted_cov);
        let variances = DVector::<f64>::from_iterator(inputs.nrows(), variances);

        T::from_dvector(&variances)
    }

    /// Predicts both the mean and the variance of the gaussian process for each row of the input.
    ///
    /// Faster than calling `predict` and `predict_variance` separately.
    ///
    /// ```rust
    /// # use friedrich::gaussian_process::GaussianProcess;
    /// # fn main() {
    /// # let training_inputs = vec![vec![0.8], vec![1.2], vec![3.8], vec![4.2]];
    /// # let training_outputs = vec![3.0, 4.0, -2.0, -2.0];
    /// let gp = GaussianProcess::default(training_inputs, training_outputs);
    /// let input = vec![1.];
    /// let (mean, var) = gp.predict_mean_variance(&input);
    /// println!("prediction: {} ± {}", mean, var.sqrt());
    /// # }
    /// ```
    pub fn predict_mean_variance<T: Input>(&self, inputs: &T) -> (T::OutVector, T::OutVector)
    {
        let inputs = T::to_dmatrix(inputs);
        assert_eq!(inputs.ncols(), self.training_inputs.as_matrix().ncols());

        // computes weights to give each training sample
        let cov_train_inputs =
            make_covariance_matrix(&self.training_inputs.as_matrix(), &inputs, &self.kernel);
        let weights = self.covmat_cholesky.solve(&cov_train_inputs);

        // ----- mean -----

        // computes prior for the given inputs
        let mut prior = self.prior.prior(&inputs);

        // weights.transpose() * &self.training_outputs + prior
        prior.gemm_tr(1f64, &weights, &self.training_outputs.as_vector(), 1f64);

        let mean = T::from_dvector(&prior);

        // ----- variance -----

        // (cov_inputs_inputs - cov_train_inputs.transpose() * weights).diagonal()
        let mut variances = DVector::<f64>::zeros(inputs.nrows());
        for (i, input) in inputs.row_iter().enumerate()
        {
            let base_cov = self.kernel.kernel(&input, &input);
            let predicted_cov = cov_train_inputs.column(i).dot(&weights.column(i));
            variances[i] = base_cov - predicted_cov;
        }

        let variance = T::from_dvector(&variances);

        // ----- result -----

        (mean, variance)
    }

    /// Returns the covariance matrix for the rows of the input.
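    ///
    /// A minimal sketch, mirroring the crate-level example (the result is an `n × n` matrix for `n` input rows):
    ///
    /// ```rust
    /// # use friedrich::gaussian_process::GaussianProcess;
    /// # fn main() {
    /// # let training_inputs = vec![vec![0.8], vec![1.2], vec![3.8], vec![4.2]];
    /// # let training_outputs = vec![3.0, 4.0, -2.0, -2.0];
    /// let gp = GaussianProcess::default(training_inputs, training_outputs);
    /// let inputs = vec![vec![1.0], vec![2.0]];
    /// let cov = gp.predict_covariance(&inputs);
    /// println!("covariance:\n{}", cov);
    /// # }
    /// ```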
    pub fn predict_covariance<T: Input>(&self, inputs: &T) -> DMatrix<f64>
    {
        // formula : cov(input,input) - cov(input,train)*cov(train,train)^-1*cov(train,input)

        let inputs = T::to_dmatrix(inputs);
        assert_eq!(inputs.ncols(), self.training_inputs.as_matrix().ncols());

        // compute the covariances
        let cov_train_inputs =
            make_covariance_matrix(&self.training_inputs.as_matrix(), &inputs, &self.kernel);
        let mut cov_inputs_inputs = make_covariance_matrix(&inputs, &inputs, &self.kernel);

        // solve linear system
        let kl = self.covmat_cholesky
                     .l()
                     .solve_lower_triangular(&cov_train_inputs)
                     .expect("predict_covariance : solve failed");

        // cov_inputs_inputs - (kl.transpose() * kl)
        cov_inputs_inputs.gemm_tr(-1f64, &kl, &kl, 1f64);
        cov_inputs_inputs
    }

    /// Produces a multivariate gaussian that can be used to sample at the input points.
    ///
    /// The sampling requires a random number generator compatible with the [rand](https://crates.io/crates/rand) crate:
    ///
    /// ```rust
    /// # use friedrich::gaussian_process::GaussianProcess;
    /// # fn main() {
    /// # let training_inputs = vec![vec![0.8], vec![1.2], vec![3.8], vec![4.2]];
    /// # let training_outputs = vec![3.0, 4.0, -2.0, -2.0];
    /// # let gp = GaussianProcess::default(training_inputs, training_outputs);
    /// // computes the distribution at some new coordinates
    /// let new_inputs = vec![vec![1.], vec![2.]];
    /// let sampler = gp.sample_at(&new_inputs);
    ///
    /// // samples from the distribution
    /// let mut rng = rand::rng();
    /// println!("samples a vector : {:?}", sampler.sample(&mut rng));
    /// # }
    /// ```
    pub fn sample_at<T: Input>(&self, inputs: &T) -> MultivariateNormal<T>
    {
        let inputs = T::to_dmatrix(inputs);
        assert_eq!(inputs.ncols(), self.training_inputs.as_matrix().ncols());

        // compute the weights
        let cov_train_inputs =
            make_covariance_matrix(&self.training_inputs.as_matrix(), &inputs, &self.kernel);
        let weights = self.covmat_cholesky.solve(&cov_train_inputs);

        // computes covariance
        let mut cov_inputs_inputs = make_covariance_matrix(&inputs, &inputs, &self.kernel);
        cov_inputs_inputs.gemm_tr(-1f64, &cov_train_inputs, &weights, 1f64);
        let cov = cov_inputs_inputs;

        // computes the mean
        let mut prior = self.prior.prior(&inputs);
        prior.gemm_tr(1f64, &weights, &self.training_outputs.as_vector(), 1f64);
        let mean = prior;

        MultivariateNormal::new(mean, cov)
    }

    //----------------------------------------------------------------------------------------------
    // FIT

    /// Fits the requested parameters and retrains the model.
    ///
    /// The fit of the noise and kernel parameters is done by gradient descent.
    /// It runs for a maximum of `max_iter` iterations and stops prematurely if all gradients are below `convergence_fraction` times their associated parameter,
    /// or if it runs for more than `max_time`.
    ///
    /// Good default values for `max_iter`, `convergence_fraction` and `max_time` are `100`, `0.05` and `chrono::Duration::seconds(3600)` (one hour).
    ///
    /// Note that, if the `noise` parameter ends up unnaturally large after the fit, it is a good sign that the kernel is poorly suited to the data.
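    ///
    /// A usage sketch with those default values, mirroring the crate-level example:
    ///
    /// ```rust
    /// # use friedrich::gaussian_process::GaussianProcess;
    /// # use chrono::Duration;
    /// # fn main() {
    /// # let training_inputs = vec![vec![0.8], vec![1.2], vec![3.8], vec![4.2]];
    /// # let training_outputs = vec![3.0, 4.0, -2.0, -2.0];
    /// let mut gp = GaussianProcess::default(training_inputs, training_outputs);
    /// let fit_prior = true;
    /// let fit_kernel = true;
    /// gp.fit_parameters(fit_prior, fit_kernel, 100, 0.05, Duration::seconds(3600));
    /// # }
    /// ```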
    pub fn fit_parameters(&mut self,
                          fit_prior: bool,
                          fit_kernel: bool,
                          max_iter: usize,
                          convergence_fraction: f64,
                          max_time: Duration)
    {
        if fit_prior
        {
            // Gets the original data back in order to update the prior.
            let training_outputs =
                self.training_outputs.as_vector() + self.prior.prior(&self.training_inputs.as_matrix());
            self.prior.fit(&self.training_inputs.as_matrix(), &training_outputs);
            let training_outputs = training_outputs - self.prior.prior(&self.training_inputs.as_matrix());
            self.training_outputs.assign(&training_outputs);
            // NOTE: Adding and subtracting each time we fit a prior might be numerically unwise.

            if !fit_kernel
            {
                // Retrains model from scratch.
                self.covmat_cholesky = make_cholesky_cov_matrix(&self.training_inputs.as_matrix(),
                                                                &self.kernel,
                                                                self.noise,
                                                                self.cholesky_epsilon);
            }
        }

        // Fits the kernel and retrains the model from scratch.
        if fit_kernel
        {
            if self.kernel.is_scalable()
            {
                self.scaled_optimize_parameters(max_iter, convergence_fraction, max_time);
            }
            else
            {
                self.optimize_parameters(max_iter, convergence_fraction, max_time);
            }
        }
    }
}