ppca/ppca_model.rs

use bit_vec::BitVec;
use nalgebra::{DMatrix, DVector};
use rand::distributions::Distribution;
use rand::Rng;
use rand_distr::Bernoulli;
use rayon::prelude::*;
use serde_derive::{Deserialize, Serialize};
use std::borrow::Cow;
use std::sync::Arc;

use crate::dataset::{Dataset, MaskedSample};
use crate::output_covariance::OutputCovariance;
use crate::prior::Prior;
use crate::utils::{standard_noise, standard_noise_matrix, Mask};

const LN_2PI: f64 = 1.8378770664093453;

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PPCAModelInner {
    output_covariance: OutputCovariance<'static>,
    mean: DVector<f64>,
}

/// A PPCA model which can infer missing values.
///
/// Each sample for this model behaves according to the following statistical
/// latent variable model:
/// ```text
/// x ~ N(0; I(nxn))
/// y = C * x + y0 + noise
/// noise ~ N(0; sigma^2 * I(mxm))
/// ```
/// Here, `x` is the latent state and `y` is the observed sample: an affine
/// transformation of the latent state contaminated by isotropic noise.
///
/// ## Note
///
/// All arrays involved have to be of data type `float64`.
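///
/// ## Example
///
/// A minimal construction sketch (the transform, noise level and mean below
/// are illustrative values only):
/// ```ignore
/// use nalgebra::{dmatrix, dvector};
///
/// // 3 output dimensions, 2 latent dimensions:
/// let model = PPCAModel::new(
///     0.1,                     // isotropic noise standard deviation
///     dmatrix![
///         1.0, 0.0;
///         0.0, 1.0;
///         1.0, 1.0;
///     ],                       // transform C (output size x state size)
///     dvector![0.0, 0.0, 0.0], // mean y0
/// );
/// assert_eq!(model.output_size(), 3);
/// assert_eq!(model.state_size(), 2);
/// ```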
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PPCAModel(Arc<PPCAModelInner>);

impl PPCAModel {
    pub fn new(isotropic_noise: f64, transform: DMatrix<f64>, mean: DVector<f64>) -> PPCAModel {
        PPCAModel(Arc::new(PPCAModelInner {
            output_covariance: OutputCovariance::new_owned(isotropic_noise, transform),
            mean,
        }))
    }

    /// Creates a new random __untrained__ model from a given latent state size and a dataset.
    pub fn init(state_size: usize, dataset: &Dataset) -> PPCAModel {
        assert!(!dataset.is_empty());
        let output_size = dataset.output_size().expect("dataset is not empty");
        let empty_dimensions = dataset.empty_dimensions();
        let mut rand_transform = standard_noise_matrix(output_size, state_size);

        for (dimension, mut row) in rand_transform.row_iter_mut().enumerate() {
            if empty_dimensions.contains(&dimension) {
                row.fill(0.0);
            }
        }

        PPCAModel(Arc::new(PPCAModelInner {
            output_covariance: OutputCovariance {
                isotropic_noise: 1.0,
                transform: Cow::Owned(rand_transform),
            },
            mean: DVector::zeros(output_size),
        }))
    }

    /// The center of mass of the distribution in the output space.
    pub fn mean(&self) -> &DVector<f64> {
        &self.0.mean
    }

    /// The standard deviation of the noise in the output space.
    pub fn isotropic_noise(&self) -> f64 {
        self.0.output_covariance.isotropic_noise
    }

    /// The linear transformation from hidden state space to output space.
    pub fn transform(&self) -> &DMatrix<f64> {
        &self.0.output_covariance.transform
    }

    /// The number of features for this model.
    pub fn output_size(&self) -> usize {
        self.0.output_covariance.output_size()
    }

    /// The number of hidden values for this model.
    pub fn state_size(&self) -> usize {
        self.0.output_covariance.state_size()
    }

    /// Creates a zeroed `InferredMasked` struct that is compatible with this model.
    pub fn uninferred(&self) -> InferredMasked {
        InferredMasked {
            model: self.clone(),
            state: DVector::zeros(self.state_size()),
            covariance: DMatrix::identity(self.state_size(), self.state_size()),
        }
    }

    /// The total number of parameters involved in training (used for information criteria).
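    ///
    /// For instance, a sketch of the Bayesian Information Criterion (`model`,
    /// `dataset` and the sample count `n` are assumed bindings):
    /// ```ignore
    /// let k = model.n_parameters() as f64;
    /// let bic = k * (n as f64).ln() - 2.0 * model.llk(&dataset);
    /// ```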
    pub fn n_parameters(&self) -> usize {
        1 + self.state_size() * self.output_size() + self.0.mean.nrows()
    }

    /// The relative strength of each hidden variable on the output. This is equivalent to the
    /// eigenvalues in the standard PCA.
    pub fn singular_values(&self) -> DVector<f64> {
        self.0
            .output_covariance
            .transform
            .column_iter()
            .map(|column| column.norm().sqrt())
            .collect::<Vec<_>>()
            .into()
    }

    /// Compute the log-likelihood of a single sample.
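    ///
    /// This is the multivariate normal log-density restricted to the observed
    /// entries: `-(1/2) * [ xᵀ Σ⁻¹ x + ln det Σ + d ln 2π ]`, where `x` is the
    /// centered (masked) sample, `Σ = C Cᵀ + σ² I` is the masked output
    /// covariance and `d` is the number of observed dimensions. Empty samples
    /// get a log-likelihood of zero.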
    pub fn llk_one(&self, sample: &MaskedSample) -> f64 {
        if sample.is_empty() {
            return 0.0;
        }

        let sub_sample = sample.mask.mask(&(sample.data_vector() - &self.0.mean));
        let sub_covariance = self.0.output_covariance.masked(&sample.mask);

        -sub_covariance.quadratic_form(&sub_sample) / 2.0
            - sub_covariance.covariance_log_det() / 2.0
            - LN_2PI / 2.0 * sub_covariance.output_size() as f64
    }

    /// Compute the log-likelihood of a given dataset.
    pub fn llk(&self, dataset: &Dataset) -> f64 {
        dataset
            .data
            .par_iter()
            .zip(&dataset.weights)
            .map(|(sample, weight)| self.llk_one(sample) * weight)
            .sum()
    }

    /// Compute the log-likelihood for each sample in a given dataset.
    pub fn llks(&self, dataset: &Dataset) -> DVector<f64> {
        dataset
            .data
            .par_iter()
            .map(|sample| self.llk_one(sample))
            .collect::<Vec<_>>()
            .into()
    }

    /// Samples a single sample from the PPCA model, masking each entry according to a
    /// Bernoulli (coin-toss) distribution with probability `mask_prob` of erasing the
    /// generated value.
    pub fn sample_one(&self, mask_prob: f64) -> MaskedSample {
        // The noiseless output: the transform applied to a standard normal latent draw,
        // plus the model mean.
        let noiseless: DVector<f64> =
            &*self.0.output_covariance.transform * standard_noise(self.state_size()) + &self.0.mean;
        let noise: DVector<f64> =
            self.0.output_covariance.isotropic_noise * standard_noise(self.output_size());
        let mask = Mask(
            Bernoulli::new(1.0 - mask_prob)
                .expect("invalid mask probability")
                .sample_iter(rand::thread_rng())
                .take(self.output_size())
                .collect::<BitVec>(),
        );

        MaskedSample {
            data: mask.fillna(&(noiseless + noise)),
            mask,
        }
    }

    /// Samples a full dataset from the PPCA model, masking each entry according to a
    /// Bernoulli (coin-toss) distribution with probability `mask_prob` of erasing the
    /// generated value.
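    ///
    /// For example, a sketch of generating a synthetic dataset of 1000 samples
    /// with roughly 10% of the entries missing (sizes are illustrative):
    /// ```ignore
    /// let synthetic: Dataset = model.sample(1_000, 0.1);
    /// ```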
    pub fn sample(&self, dataset_size: usize, mask_prob: f64) -> Dataset {
        (0..dataset_size)
            .into_par_iter()
            .map(|_| self.sample_one(mask_prob))
            .collect()
    }

    /// Infers the hidden components for a single sample. Use this method for
    /// fine-grained control over the properties you want to extract from the model.
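    ///
    /// A usage sketch (`model` and `sample` are assumed bindings):
    /// ```ignore
    /// let inferred = model.infer_one(&sample);
    /// let posterior_mean = inferred.state();          // mean in latent space
    /// let reconstruction = inferred.smoothed(&model); // denoised output values
    /// ```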
    pub fn infer_one(&self, sample: &MaskedSample) -> InferredMasked {
        if sample.is_empty() {
            return self.uninferred();
        }

        let sub_sample = sample.mask.mask(&(sample.data_vector() - &self.0.mean));
        let sub_covariance = self.0.output_covariance.masked(&sample.mask);

        InferredMasked {
            model: self.clone(),
            state: sub_covariance.estimator_transform() * sub_sample,
            covariance: sub_covariance.estimator_covariance(),
        }
    }

    /// Builds a new [`InferredMasked`] from the raw values.
    pub fn inferred_one(&self, state: DVector<f64>, covariance: DMatrix<f64>) -> InferredMasked {
        InferredMasked {
            model: self.clone(),
            state,
            covariance,
        }
    }

    /// Infers the hidden components for each sample in the dataset. Use this method for
    /// fine-grained control over the properties you want to extract from the model.
    pub fn infer(&self, dataset: &Dataset) -> Vec<InferredMasked> {
        dataset
            .data
            .par_iter()
            .map(|sample| self.infer_one(sample))
            .collect()
    }

    /// Filters a single sample, removing noise from the extant values and
    /// inferring the missing values.
    pub fn smooth_one(&self, sample: &MaskedSample) -> MaskedSample {
        MaskedSample::unmasked(self.infer_one(sample).smoothed(self))
    }

    /// Filters each sample of a given dataset, removing noise from the extant values and
    /// inferring the missing values.
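    ///
    /// A sketch contrasting the two imputation flavors offered by this model
    /// (`model` and `dataset` are assumed bindings):
    /// ```ignore
    /// let denoised = model.smooth(&dataset);       // replaces *every* value by the model's best guess
    /// let completed = model.extrapolate(&dataset); // fills in only the missing values
    /// ```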
    pub fn smooth(&self, dataset: &Dataset) -> Dataset {
        dataset
            .data
            .par_iter()
            .zip(&dataset.weights)
            .map(|(sample, &weight)| (self.smooth_one(sample), weight))
            .collect()
    }

    /// Extrapolates the missing values with the most probable values for a single sample. This
    /// method does not alter the extant values.
    pub fn extrapolate_one(&self, sample: &MaskedSample) -> MaskedSample {
        MaskedSample::unmasked(self.infer_one(sample).extrapolated(self, sample))
    }

    /// Extrapolates the missing values with the most probable values for a full dataset. This
    /// method does not alter the extant values.
    pub fn extrapolate(&self, dataset: &Dataset) -> Dataset {
        dataset
            .data
            .par_iter()
            .zip(&dataset.weights)
            .map(|(sample, &weight)| (self.extrapolate_one(sample), weight))
            .collect()
    }

    /// Makes one iteration of the EM algorithm for the PPCA over an observed dataset,
    /// returning the improved model. The log-likelihood of the returned model is
    /// guaranteed to **never decrease**.
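    ///
    /// A minimal training-loop sketch (the state size and iteration count are
    /// illustrative hyperparameters):
    /// ```ignore
    /// let mut model = PPCAModel::init(4, &dataset);
    /// for _ in 0..32 {
    ///     model = model.iterate(&dataset);
    /// }
    /// let model = model.to_canonical();
    /// ```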
    #[must_use]
    pub fn iterate(&self, dataset: &Dataset) -> PPCAModel {
        self.iterate_with_prior(dataset, &Prior::default())
    }

    /// Makes one iteration of the EM algorithm for the PPCA over an observed dataset,
    /// using a supplied PPCA prior and returning the improved model. This method will
    /// not necessarily increase the log-likelihood of the returned model, but it will
    /// return an improved _maximum a posteriori_ (MAP) estimate of the PPCA model
    /// according to the supplied prior.
    #[must_use]
    pub fn iterate_with_prior(&self, dataset: &Dataset, prior: &Prior) -> PPCAModel {
        let inferred = self.infer(dataset);

        // Updated transform:
        let total_cross_moment = dataset
            .data
            .par_iter()
            .zip(&dataset.weights)
            .zip(&inferred)
            .map(|((sample, &weight), inferred)| {
                let centered_filled = sample.mask.fillna(&(sample.data_vector() - &self.0.mean));
                weight * centered_filled * inferred.state.transpose()
            })
            .reduce(
                || DMatrix::zeros(self.output_size(), self.state_size()),
                |this, other| this + other,
            ); // rayon's `sum()` doesn't work for `DMatrix`, so reduce by hand.
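        // M-step for the transform: for each output dimension `i`, solve the normal
        // equations `(sum_n w_n E[x xᵀ] + precision * I) c_i = (total cross moment)_i`,
        // restricted to the samples that actually observe dimension `i`.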
        let new_transform_rows = (0..self.output_size())
            .into_par_iter()
            .map(|idx| {
                let total_second_moment = dataset
                    .data
                    .iter()
                    .zip(&dataset.weights)
                    .zip(&inferred)
                    .filter(|((sample, _), _)| sample.mask.0[idx])
                    .map(|((_, &weight), inferred)| weight * inferred.second_moment())
                    // In case we get an empty dimension...
                    .chain([DMatrix::zeros(self.state_size(), self.state_size())])
                    .sum::<DMatrix<f64>>()
                    + prior.transformation_precision()
                        * DMatrix::<f64>::identity(self.state_size(), self.state_size());
                let cross_moment_row = total_cross_moment.row(idx).transpose();
                total_second_moment
                    .qr()
                    .solve(&cross_moment_row)
                    .unwrap_or_else(|| {
                        // Keep the old row if the linear system cannot be solved.
                        self.0
                            .output_covariance
                            .transform
                            .row(idx)
                            .transpose()
                            .clone_owned()
                    })
                    .transpose()
            })
            .collect::<Vec<_>>();
        let new_transform = DMatrix::from_rows(&new_transform_rows);

        // Updated isotropic noise:
        let (square_error, deviations_square_sum, total_deviation, totals) = dataset
            .data
            .par_iter()
            .zip(&dataset.weights)
            .zip(&inferred)
            .filter(|((sample, _), _)| !sample.is_empty())
            .map(|((sample, &weight), inferred)| {
                let sub_covariance = self.0.output_covariance.masked(&sample.mask);
                let sub_transform = &*sub_covariance.transform;
                let deviation = sample.mask.fillna(
                    &(sample.data_vector()
                        - &*self.0.output_covariance.transform * &inferred.state
                        - &self.0.mean),
                );

                (
                    weight * (sub_transform * &inferred.covariance).dot(&sub_transform),
                    weight * deviation.norm_squared(),
                    weight * deviation,
                    weight * sample.mask.as_vector(),
                )
            })
            .reduce_with(
                |(square_error, deviation_square_sum, total_deviation, totals),
                 (square_error_, deviation_square_sum_, total_deviation_, totals_)| {
                    (
                        square_error + square_error_,
                        deviation_square_sum + deviation_square_sum_,
                        total_deviation + total_deviation_,
                        totals + totals_,
                    )
                },
            )
            .expect("non-empty dataset");

        let isotropic_noise_sq = if prior.has_isotropic_noise_prior() {
            // Note: inverse gamma inference here...
            // Recall the _mode_ for the inverse gamma:
            //     beta_post / (alpha_post + 1)
            // And...
            //     beta_post = [sum of all the squared stuff] / 2.0 + beta_prior
            //     alpha_post = [number of samples] / 2.0 + alpha_prior
            ((square_error + deviations_square_sum) / 2.0 + prior.isotropic_noise_beta())
                / (totals.sum() / 2.0 + prior.isotropic_noise_alpha() + 1.0)
        } else {
            (square_error + deviations_square_sum) / totals.sum()
        };

        let mut new_mean = total_deviation.zip_map(
            &totals,
            |sum, count| if count > 0.0 { sum / count } else { 0.0 },
        ) + &self.0.mean;

        if prior.has_mean_prior() {
            new_mean = prior.smooth_mean(
                new_mean,
                DMatrix::from_diagonal(&totals) / isotropic_noise_sq,
            );
        }

        PPCAModel(Arc::new(PPCAModelInner {
            output_covariance: OutputCovariance {
                transform: Cow::Owned(new_transform),
                isotropic_noise: isotropic_noise_sq.sqrt(),
            },
            mean: new_mean,
        }))
    }

    /// Returns a canonical version of this model. This does not alter the log-probability
    /// function nor the quality of the training. All it does is transform the hidden
    /// variables.
    pub fn to_canonical(&self) -> PPCAModel {
        // Yes, we can have an empty state! In that case, there is nothing to be done.
        if self.state_size() == 0 {
            return self.clone();
        }

        let mut svd = self
            .0
            .output_covariance
            .transform
            .clone_owned()
            .svd(true, false);
        svd.v_t = Some(DMatrix::identity(self.state_size(), self.state_size()));

        let mut new_transform = svd.recompose().expect("all matrices were calculated");
        // Flip the sign of each column so that its entries sum to a non-negative value.
        for mut column in new_transform.column_iter_mut() {
            column *= column.sum().signum();
        }

        PPCAModel(Arc::new(PPCAModelInner {
            output_covariance: OutputCovariance::new_owned(
                self.0.output_covariance.isotropic_noise,
                new_transform,
            ),
            mean: self.0.mean.clone(),
        }))
    }
}

/// The inferred probability distribution in the state space of a given sample.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InferredMasked {
    model: PPCAModel,
    state: DVector<f64>,
    covariance: DMatrix<f64>,
}

impl InferredMasked {
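    /// The posterior second moment `E[x xᵀ] = state · stateᵀ + covariance` of this sample.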
    pub(crate) fn second_moment(&self) -> DMatrix<f64> {
        &self.state * self.state.transpose() + &self.covariance
    }

    /// The mean of the probability distribution in the state space.
    pub fn state(&self) -> &DVector<f64> {
        &self.state
    }

    /// The covariance matrix of the probability distribution in the state space. The covariance
    /// can change from sample to sample, depending on the mask: the more masked values a sample
    /// has, the bigger the covariance becomes overall.
    pub fn covariance(&self) -> &DMatrix<f64> {
        &self.covariance
    }

    /// The smoothed output values for a given output model.
    pub fn smoothed(&self, ppca: &PPCAModel) -> DVector<f64> {
        &*ppca.0.output_covariance.transform * self.state() + &ppca.0.mean
    }

    /// The extrapolated output values for a given output model and extant values in a given
    /// sample.
    pub fn extrapolated(&self, ppca: &PPCAModel, sample: &MaskedSample) -> DVector<f64> {
        let filtered = self.smoothed(ppca);
        sample.mask.choose(&sample.data_vector(), &filtered)
    }

    /// The covariance for the smoothed output values.
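    ///
    /// This is `σ²·I + C·Sxx·Cᵀ`, where `Sxx` is the posterior state covariance of this
    /// sample.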
    ///
    /// # Note:
    ///
    /// Afraid of the big, fat matrix? The method `smoothed_covariance_diagonal` might just
    /// save your life.
    pub fn smoothed_covariance(&self, ppca: &PPCAModel) -> DMatrix<f64> {
        let covariance = &ppca.0.output_covariance;

        DMatrix::identity(covariance.output_size(), covariance.output_size())
            * covariance.isotropic_noise.powi(2)
            + &*covariance.transform * &self.covariance * covariance.transform.transpose()
    }

    /// Returns an _approximation_ of the smoothed output covariance matrix, treating each
    /// output dimension as an independent normal distribution.
    ///
    /// # Note:
    ///
    /// Use this not to get lost with big matrices in the output, losing CPU, memory and hair.
    pub fn smoothed_covariance_diagonal(&self, ppca: &PPCAModel) -> DVector<f64> {
        // Here, we will calculate the diagonal of `I sigma^2 + C Sxx C^T` in a clever way...

        let covariance = &ppca.0.output_covariance;

        // The `C * Sxx` part.
        let transformed_state_covariance = &*covariance.transform * &self.covariance;

        // Now comes the trick! Calculate only the diagonal elements of the
        // `transformed_state_covariance * C^T` part, using
        // `diag(A * B^T)[i] = dot(row_i(A), row_i(B))`.
        let noiseless_output_diagonal = transformed_state_covariance
            .row_iter()
            .zip(covariance.transform.row_iter())
            .map(|(row_left, row_right)| row_left.dot(&row_right));

        // Finally, add the isotropic noise term...
        noiseless_output_diagonal
            .map(|noiseless_output_variance| {
                noiseless_output_variance + covariance.isotropic_noise.powi(2)
            })
            .collect::<Vec<_>>()
            .into()
    }

    /// The covariance for the extrapolated values for a given output model and extant values
    /// in a given sample.
    ///
    /// # Note:
    ///
    /// Afraid of the big, fat matrix? The method `extrapolated_covariance_diagonal` might just
    /// save your life.
    pub fn extrapolated_covariance(&self, ppca: &PPCAModel, sample: &MaskedSample) -> DMatrix<f64> {
        let negative = sample.mask().negate();

        if !negative.0.any() {
            return DMatrix::zeros(ppca.output_size(), ppca.output_size());
        }

        let sub_covariance = ppca.0.output_covariance.masked(&negative);

        let output_covariance =
            DMatrix::identity(sub_covariance.output_size(), sub_covariance.output_size())
                * sub_covariance.isotropic_noise.powi(2)
                + &*sub_covariance.transform
                    * &self.covariance
                    * sub_covariance.transform.transpose();

        negative.expand_matrix(output_covariance)
    }

    /// Returns an _approximation_ of the extrapolated output covariance matrix, treating each
    /// masked output as an independent normal distribution.
    ///
    /// # Note
    ///
    /// Use this not to get lost with big matrices in the output, losing CPU, memory and hair.
    pub fn extrapolated_covariance_diagonal(
        &self,
        ppca: &PPCAModel,
        sample: &MaskedSample,
    ) -> DVector<f64> {
        // Here, we will calculate the diagonal of `I sigma^2 + C Sxx C^T` for the
        // unobserved dimensions in a clever way...

        let negative = sample.mask().negate();

        if !negative.0.any() {
            return DVector::zeros(ppca.output_size());
        }

        let sub_covariance = ppca.0.output_covariance.masked(&negative);

        // The `C * Sxx` part.
        let transformed_state_covariance = &*sub_covariance.transform * &self.covariance;

        // Now comes the trick! Calculate only the diagonal elements of the
        // `transformed_state_covariance * C^T` part, using
        // `diag(A * B^T)[i] = dot(row_i(A), row_i(B))`.
        let noiseless_output_diagonal = transformed_state_covariance
            .row_iter()
            .zip(sub_covariance.transform.row_iter())
            .map(|(row_left, row_right)| row_left.dot(&row_right));

        // Finally, add the isotropic noise term...
        let diagonal_reduced = noiseless_output_diagonal
            .map(|noiseless_output_variance| {
                noiseless_output_variance + sub_covariance.isotropic_noise.powi(2)
            })
            .collect::<Vec<_>>()
            .into();

        negative.expand(&diagonal_reduced)
    }

    /// Creates a sampler for the posterior distribution of this inferred sample in the
    /// output space. Each draw includes the model's isotropic noise.
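    ///
    /// A usage sketch (`model` and `sample` are assumed bindings):
    /// ```ignore
    /// use rand::distributions::Distribution;
    ///
    /// let sampler = model.infer_one(&sample).posterior_sampler();
    /// let draw = sampler.sample(&mut rand::thread_rng());
    /// ```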
    pub fn posterior_sampler(&self) -> PosteriorSampler {
        let cholesky = self
            .covariance
            .clone()
            .cholesky()
            .expect("Cholesky decomposition failed");
        PosteriorSampler {
            model: self.model.clone(),
            state: self.state.clone(),
            cholesky_l: cholesky.l(),
        }
    }
}

/// Samples from the posterior distribution of an inferred sample in the output space.
/// Each draw includes the model's isotropic noise.
pub struct PosteriorSampler {
    model: PPCAModel,
    state: DVector<f64>,
    cholesky_l: DMatrix<f64>,
}

impl Distribution<DVector<f64>> for PosteriorSampler {
    fn sample<R>(&self, rng: &mut R) -> DVector<f64>
    where
        R: Rng + ?Sized,
    {
        // State noise:
        let standard: DVector<f64> = (0..self.state.len())
            .map(|_| rand_distr::StandardNormal.sample(rng))
            .collect::<Vec<_>>()
            .into();
        // Output noise:
        let noise: DVector<f64> = (0..self.model.output_size())
            .map(|_| {
                let standard: f64 = rand_distr::StandardNormal.sample(rng);
                self.model.0.output_covariance.isotropic_noise * standard
            })
            .collect::<Vec<_>>()
            .into();

        noise
            + self.model.mean()
            + self.model.transform() * (&self.state + &self.cholesky_l * standard)
    }
}

#[cfg(test)]
mod test {
    use bit_vec::BitVec;
    use nalgebra::{dmatrix, dvector};

    use super::*;

    fn toy_model() -> PPCAModel {
        PPCAModel::new(
            0.1,
            dmatrix![
                1.0, 1.0, 0.0;
                1.0, 0.0, 1.0;
            ]
            .transpose(),
            dvector![0.0, 1.0, 0.0],
        )
    }

    fn output_covariance() -> OutputCovariance<'static> {
        OutputCovariance::new_owned(
            0.1,
            dmatrix![
                1.0, 1.0, 0.0;
                1.0, 0.0, 1.0;
            ]
            .transpose(),
        )
    }

    #[test]
    fn test_quadratic_form() {
        let output_covariance = output_covariance();
        approx::assert_relative_eq!(
            output_covariance.quadratic_form(&dvector![1.0, 1.0, 1.0]),
            34.219288
        );
    }

    #[test]
    fn test_covariance_log_det() {
        let output_covariance = output_covariance();
        approx::assert_relative_eq!(output_covariance.covariance_log_det(), -3.49328);
    }

    #[test]
    fn test_llk() {
        let model = toy_model();
        dbg!(model.llk(&Dataset::new(vec![MaskedSample {
            data: dvector![1.0, 2.0, 3.0],
            mask: Mask(BitVec::from_elem(3, true)),
        }])));
    }
}