ppca/
dataset.rs

1use bit_vec::BitVec;
2use nalgebra::DVector;
3use rayon::prelude::*;
4use serde_derive::{Deserialize, Serialize};
5use std::{ops::Index, sync::Arc};
6
7use crate::utils::Mask;
8
9/// A data sample with potentially missing values.
10#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct MaskedSample {
12    pub(crate) data: DVector<f64>,
13    pub(crate) mask: Mask,
14}
15
16impl MaskedSample {
17    /// Creates a masked sample from a vector, masking all elements which are not finite (e.g.,
18    /// `NaN` and `inf`).
19    pub fn mask_non_finite(data: DVector<f64>) -> MaskedSample {
20        let mask = data.iter().copied().map(f64::is_finite).collect::<BitVec>();
21        MaskedSample::new(data, Mask(mask))
22    }
23
24    /// Creates a masked sample from data and a mask. The value is considered missing if its index
25    /// in the masked is set to `false`.
26    pub fn new(data: DVector<f64>, mask: Mask) -> MaskedSample {
27        MaskedSample { data, mask }
28    }
29
30    /// Creates a sample without any masked values.
31    pub fn unmasked(data: DVector<f64>) -> MaskedSample {
32        MaskedSample {
33            mask: Mask::unmasked(data.len()),
34            data,
35        }
36    }
37
38    /// Returns the data vector associated with this sample.
39    pub fn data_vector(&self) -> DVector<f64> {
40        DVector::from(self.data.clone())
41    }
42
43    /// Returns `true` if all values are masked.
44    pub fn is_empty(&self) -> bool {
45        !self.mask.0.any()
46    }
47
48    /// Returns the mask of this sample. The value is considered missing if its index
49    /// in the masked is set to `false`.
50    pub fn mask(&self) -> &Mask {
51        &self.mask
52    }
53
54    /// Returns whether the `idx` dimension in this sample is set.
55    ///
56    /// # Panics
57    ///
58    /// This function panics if `idx` is out of bounds.
59    pub fn is_set(&self, idx: usize) -> bool {
60        self.mask.is_set(idx)
61    }
62
63    /// Returns the data vector associated with this sample, substituting all masked values by `NaN`.
64    pub fn masked_vector(&self) -> DVector<f64> {
65        self.data
66            .iter()
67            .copied()
68            .zip(&self.mask.0)
69            .map(|(value, selected)| if selected { value } else { f64::NAN })
70            .collect::<Vec<_>>()
71            .into()
72    }
73}
74
75impl Index<usize> for MaskedSample {
76    type Output = f64;
77    fn index(&self, index: usize) -> &Self::Output {
78        if self.is_set(index) {
79            &self.data[index]
80        } else {
81            panic!("Index out of bounds: index {index} is masked in sample")
82        }
83    }
84}
85
86/// Represents a dataset. This is a wrapper over a 2D array of dimensions
87/// `(n_samples, n_features)`.
88///
89/// ## Note
90///
91/// All arrays involved have to be of data type `float64`.
92#[derive(Debug, Clone, Serialize, Deserialize)]
93pub struct Dataset {
94    /// The data rows of this dataset.
95    pub data: Arc<Vec<MaskedSample>>,
96    /// The weights associated with each sample. Use this only if you are using the PPCA as a
97    /// component of a greater EM scheme (or otherwise know what you are doing). Else, let the
98    /// package set it automatically to 1.
99    pub weights: Vec<f64>,
100}
101
102impl From<Vec<MaskedSample>> for Dataset {
103    fn from(value: Vec<MaskedSample>) -> Self {
104        Dataset {
105            weights: vec![1.0; value.len()],
106            data: Arc::new(value),
107        }
108    }
109}
110
111impl FromIterator<MaskedSample> for Dataset {
112    fn from_iter<T>(iter: T) -> Self
113    where
114        T: IntoIterator<Item = MaskedSample>,
115    {
116        let data: Vec<_> = iter.into_iter().collect();
117        Self::new(data)
118    }
119}
120
121impl FromIterator<(MaskedSample, f64)> for Dataset {
122    fn from_iter<T>(iter: T) -> Self
123    where
124        T: IntoIterator<Item = (MaskedSample, f64)>,
125    {
126        let (data, weights): (Vec<_>, Vec<_>) = iter.into_iter().unzip();
127        Self::new_with_weights(data, weights)
128    }
129}
130
131impl FromParallelIterator<MaskedSample> for Dataset {
132    fn from_par_iter<T>(iter: T) -> Self
133    where
134        T: IntoParallelIterator<Item = MaskedSample>,
135    {
136        let data: Vec<_> = iter.into_par_iter().collect();
137        Self::new(data)
138    }
139}
140
141impl FromParallelIterator<(MaskedSample, f64)> for Dataset {
142    fn from_par_iter<T>(iter: T) -> Self
143    where
144        T: IntoParallelIterator<Item = (MaskedSample, f64)>,
145    {
146        let (data, weights): (Vec<_>, Vec<_>) = iter.into_par_iter().unzip();
147        Self::new_with_weights(data, weights)
148    }
149}
150
151impl Dataset {
152    /// Creates a new dataset from a set of masked samples.
153    pub fn new(data: Vec<MaskedSample>) -> Dataset {
154        Dataset {
155            weights: vec![1.0; data.len()],
156            data: Arc::new(data),
157        }
158    }
159
160    /// Creates a new dataset from a set of weighted masked samples.
161    pub fn new_with_weights(data: Vec<MaskedSample>, weights: Vec<f64>) -> Dataset {
162        assert_eq!(data.len(), weights.len());
163        Dataset {
164            data: Arc::new(data),
165            weights,
166        }
167    }
168
169    /// Creates a new dataset with the same sample, but with different weights. This operation is
170    /// cheap, since it does not clone the dataset (it's protected by an `Arc`).
171    pub fn with_weights(&self, weights: Vec<f64>) -> Dataset {
172        Dataset {
173            data: self.data.clone(),
174            weights,
175        }
176    }
177
178    /// The length of this dataset.
179    pub fn len(&self) -> usize {
180        self.data.len()
181    }
182
183    /// Whether this dataset is empty.
184    pub fn is_empty(&self) -> bool {
185        self.data.is_empty()
186    }
187
188    /// The number of dimensions in each sample. Returns `None` if dataset is empty.
189    pub fn output_size(&self) -> Option<usize> {
190        self.data.first().map(|sample| sample.mask().0.len())
191    }
192
193    /// Lists the dimensions which as masked in __all__ samples in this dataset.
194    pub fn empty_dimensions(&self) -> Vec<usize> {
195        let Some(n_dimensions) = self.data.first().map(|sample| sample.mask().0.len()) else {
196            return vec![]
197        };
198        let new_mask = || BitVec::from_elem(n_dimensions, false);
199        let poormans_or = |mut this: BitVec, other: &BitVec| {
200            for (position, is_selected) in other.iter().enumerate() {
201                if is_selected {
202                    this.set(position, true);
203                }
204            }
205            this
206        };
207
208        let is_not_empty_dimension = self
209            .data
210            .par_iter()
211            .fold(&new_mask, |buffer, sample| {
212                poormans_or(buffer, &sample.mask().0)
213            })
214            .reduce(&new_mask, |this, other| poormans_or(this, &other));
215
216        is_not_empty_dimension
217            .into_iter()
218            .enumerate()
219            .filter(|(_, is_not_empty)| !is_not_empty)
220            .map(|(dimension, _)| dimension)
221            .collect()
222    }
223}