//! Collect data for and configure a model

use crate::{
    error::{RegressionError, RegressionResult},
    fit::{self, Fit},
    glm::Glm,
    math::is_rank_deficient,
    num::Float,
    response::Response,
    utility::one_pad,
};
use fit::options::{FitConfig, FitOptions};
use ndarray::{Array1, Array2, ArrayBase, ArrayView1, ArrayView2, Data, Ix1, Ix2};
use std::marker::PhantomData;

#[derive(Clone)]
pub struct Dataset<F>
where
    F: Float,
{
    /// The observed response data, with one entry per observation.
    pub y: Array1<F>,
    /// The design matrix, with observations in rows and covariates in columns.
    pub x: Array2<F>,
    /// The offset in the linear predictor for each data point. This can be used
    /// to fix the effect of control variables.
    // TODO: Consider making this an option of a reference.
    pub linear_offset: Option<Array1<F>>,
    /// The variance weight of each observation (a.k.a. analytic weights).
    pub weights: Option<Array1<F>>,
    /// The frequency of each observation (traditionally positive integers).
    pub freqs: Option<Array1<F>>,
}

impl<F> Dataset<F>
where
    F: Float,
{
    /// Returns the linear predictors, i.e. the design matrix multiplied by the
    /// regression parameters. Each entry in the resulting array is the linear
    /// predictor for a given observation. If linear offsets for each
    /// observation are provided, these are added to the linear predictors.
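    ///
    /// # Example
    ///
    /// A minimal sketch with made-up numbers (this assumes the `model` module
    /// and the `Dataset` fields are publicly accessible, as declared here):
    ///
    /// ```no_run
    /// use ndarray::array;
    /// use ndarray_glm::model::Dataset;
    ///
    /// let data: Dataset<f64> = Dataset {
    ///     y: array![0.0, 1.0],
    ///     x: array![[1.0, 2.0], [1.0, 3.0]],
    ///     linear_offset: Some(array![0.1, -0.1]),
    ///     weights: None,
    ///     freqs: None,
    /// };
    /// // eta = X . beta + offset = [0.5 + 2.0 + 0.1, 0.5 + 3.0 - 0.1] ≈ [2.6, 3.4]
    /// let eta = data.linear_predictor(&array![0.5, 1.0]);
    /// ```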
    pub fn linear_predictor(&self, regressors: &Array1<F>) -> Array1<F> {
        let linear_predictor: Array1<F> = self.x.dot(regressors);
        // Add linear offsets to the predictors if they are set
        if let Some(lin_offset) = &self.linear_offset {
            linear_predictor + lin_offset
        } else {
            linear_predictor
        }
    }

    /// Total number of observations: the sum of the frequency weights, or the
    /// number of entries in `y` if no frequencies are given.
    pub fn n_obs(&self) -> F {
        match &self.freqs {
            None => F::from(self.y.len()).unwrap(),
            Some(f) => f.sum(),
        }
    }

    /// Returns the frequency-weighted sum of the variance weights, or the
    /// total number of observations if no variance weights are set.
    pub(crate) fn sum_weights(&self) -> F {
        match &self.weights {
            None => self.n_obs(),
            Some(w) => self.freq_sum(w),
        }
    }

    /// Returns the effective sample size corrected for the design effect: the
    /// square of the sum of the variance weights divided by the sum of their
    /// squares (with frequency weights applied to each sum).
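    ///
    /// For example, with variance weights `[1, 1, 2]` and no frequencies, the
    /// effective sample size is (1 + 1 + 2)² / (1² + 1² + 2²) = 16/6 ≈ 2.67.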
    pub fn n_eff(&self) -> F {
        match &self.weights {
            None => self.n_obs(),
            Some(w) => {
                let v1 = self.freq_sum(w);
                let w2 = w * w;
                let v2 = self.freq_sum(&w2);
                v1 * v1 / v2
            }
        }
    }

    /// Returns the variance weights, or an array of ones if none are set.
    pub(crate) fn get_variance_weights(&self) -> Array1<F> {
        match &self.weights {
            Some(w) => w.clone(),
            None => Array1::<F>::ones(self.y.len()),
        }
    }

    /// Multiply the input element-wise by the frequency weights, if they exist.
    pub(crate) fn apply_freq_weights(&self, rhs: Array1<F>) -> Array1<F> {
        match &self.freqs {
            None => rhs,
            Some(f) => f * rhs,
        }
    }

    /// Multiply the input element-wise by both the frequency and the variance
    /// weights, where present.
    pub(crate) fn apply_total_weights(&self, rhs: Array1<F>) -> Array1<F> {
        self.apply_freq_weights(self.apply_var_weights(rhs))
    }

    /// Multiply the input element-wise by the variance weights, if they exist.
    pub(crate) fn apply_var_weights(&self, rhs: Array1<F>) -> Array1<F> {
        match &self.weights {
            None => rhs,
            Some(w) => w * rhs,
        }
    }

    /// Sum over the input array using the frequencies (but not the variance
    /// weights) as weights. The frequency weights fundamentally affect the sum
    /// operator itself, so they enter every sum, unlike the variance weights.
    pub(crate) fn freq_sum(&self, rhs: &Array1<F>) -> F {
        self.apply_freq_weights(rhs.clone()).sum()
    }

    /// Return the weighted sum of the RHS, where both frequency and variance weights are used.
    pub(crate) fn weighted_sum(&self, rhs: &Array1<F>) -> F {
        self.freq_sum(&self.apply_var_weights(rhs.clone()))
    }

    /// Returns the transpose of the design matrix with each column (i.e. each
    /// observation) scaled by its total weight.
    pub(crate) fn x_conj(&self) -> Array2<F> {
        let xt = self.x.t().to_owned();
        let xt = match &self.freqs {
            None => xt,
            Some(f) => xt * f,
        };
        match &self.weights {
            None => xt,
            Some(w) => xt * w,
        }
    }
}

/// Holds the data and configuration settings for a regression.
pub struct Model<M, F>
where
    M: Glm,
    F: Float,
{
    pub(crate) model: PhantomData<M>,
    /// The dataset
    pub data: Dataset<F>,
    /// Whether the intercept term is used (commonly true)
    pub use_intercept: bool,
}

impl<M, F> Model<M, F>
where
    M: Glm,
    F: Float,
{
    /// Perform the regression and return a fit object holding the results.
    pub fn fit(&self) -> RegressionResult<Fit<'_, M, F>> {
        self.fit_options().fit()
    }

    /// Returns a builder interface for configuring the fit options.
    pub fn fit_options(&self) -> FitConfig<'_, M, F> {
        FitConfig {
            model: self,
            options: FitOptions::default(),
        }
    }

    /// An experimental interface allowing fit options to be set externally.
    pub fn with_options(&self, options: FitOptions<F>) -> FitConfig<'_, M, F> {
        FitConfig {
            model: self,
            options,
        }
    }
}

/// Provides an interface to create the full model option struct with convenient
/// type inference.
pub struct ModelBuilder<M: Glm> {
    _model: PhantomData<M>,
}

impl<M: Glm> ModelBuilder<M> {
    /// Borrow the response (Y) and covariate (X) data, where each row is a new
    /// observation, and create the full model builder to allow additional
    /// options to be adjusted.
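    ///
    /// # Example
    ///
    /// A minimal sketch of a linear fit with made-up numbers (assumes `Linear`
    /// and `ModelBuilder` are re-exported at the crate root, as in the crate's
    /// README):
    ///
    /// ```no_run
    /// use ndarray::array;
    /// use ndarray_glm::{Linear, ModelBuilder};
    ///
    /// let y = array![0.3, 1.3, 0.7];
    /// let x = array![[0.1, 0.2], [-0.4, 0.1], [0.2, 0.4]];
    /// // An intercept column is added automatically unless no_constant() is called.
    /// let model = ModelBuilder::<Linear>::data(&y, &x).build().unwrap();
    /// let fit = model.fit().unwrap();
    /// println!("fit result: {}", fit.result);
    /// ```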
    pub fn data<'a, Y, F, YD, XD>(
        data_y: &'a ArrayBase<YD, Ix1>,
        data_x: &'a ArrayBase<XD, Ix2>,
    ) -> ModelBuilderData<'a, M, Y, F>
    where
        Y: Response<M>,
        F: Float,
        YD: Data<Elem = Y>,
        XD: Data<Elem = F>,
    {
        ModelBuilderData {
            model: PhantomData,
            data_y: data_y.view(),
            data_x: data_x.view(),
            linear_offset: None,
            var_weights: None,
            freq_weights: None,
            use_intercept_term: true,
            colin_tol: F::epsilon(),
            error: None,
        }
    }
}

/// Holds the data and all the specifications for the model and provides
/// functions to adjust the settings.
pub struct ModelBuilderData<'a, M, Y, F>
where
    M: Glm,
    Y: Response<M>,
    F: 'static + Float,
{
    model: PhantomData<M>,
    /// Observed response variable data where each entry is a new observation.
    data_y: ArrayView1<'a, Y>,
    /// Design matrix of observed covariate data where each row is a new
    /// observation and each column represents a different covariate
    /// (independent variable).
    data_x: ArrayView2<'a, F>,
    /// The offset in the linear predictor for each data point. This can be used
    /// to incorporate control terms.
    // TODO: consider making this a reference/ArrayView. Y and X are effectively
    // cloned so perhaps this isn't a big deal.
    linear_offset: Option<Array1<F>>,
    /// The variance/analytic weights for each observation.
    var_weights: Option<Array1<F>>,
    /// The frequency/count of each observation.
    freq_weights: Option<Array1<F>>,
    /// Whether to use an intercept term. Defaults to `true`.
    use_intercept_term: bool,
    /// Tolerance for the determinant check on the rank of the data matrix X.
    colin_tol: F,
    /// An error encountered while building, surfaced when `build()` is called.
    error: Option<RegressionError>,
}

/// A builder to generate a Model object
impl<'a, M, Y, F> ModelBuilderData<'a, M, Y, F>
where
    M: Glm,
    Y: Response<M> + Copy,
    F: Float,
{
    /// Provide an offset added to the linear predictor for each data point.
    /// This can be used to control for fixed effects or in multi-level models.
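    ///
    /// # Example
    ///
    /// A minimal sketch with made-up numbers, fixing a known per-observation
    /// contribution so the fit estimates only the remaining effects (assumes
    /// the crate-root re-exports used in the crate's README):
    ///
    /// ```no_run
    /// use ndarray::array;
    /// use ndarray_glm::{Linear, ModelBuilder};
    ///
    /// let y = array![0.3, 1.3, 0.7];
    /// let x = array![[0.1], [-0.4], [0.2]];
    /// // Each entry of the offset is added to the corresponding linear predictor.
    /// let model = ModelBuilder::<Linear>::data(&y, &x)
    ///     .linear_offset(array![0.1, 0.0, -0.2])
    ///     .build()
    ///     .unwrap();
    /// ```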
    pub fn linear_offset(mut self, linear_offset: Array1<F>) -> Self {
        if self.linear_offset.is_some() {
            self.error = Some(RegressionError::BuildError(
                "Offsets specified multiple times".to_string(),
            ));
        }
        self.linear_offset = Some(linear_offset);
        self
    }

    /// Frequency weights (a.k.a. counts) for each observation. Traditionally these are positive
    /// integers representing the number of times each observation appears identically.
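    ///
    /// # Example
    ///
    /// A sketch with made-up data (assumes `Logistic` is re-exported at the
    /// crate root and that `bool` implements its response trait): frequency
    /// weights of `[1, 2, 1]` are equivalent to listing the second observation
    /// twice.
    ///
    /// ```no_run
    /// use ndarray::array;
    /// use ndarray_glm::{Logistic, ModelBuilder};
    ///
    /// let y = array![true, false, true];
    /// let x = array![[0.5], [1.5], [0.8]];
    /// let model = ModelBuilder::<Logistic>::data(&y, &x)
    ///     .freq_weights(array![1, 2, 1])
    ///     .build()
    ///     .unwrap();
    /// ```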
    pub fn freq_weights(mut self, freqs: Array1<usize>) -> Self {
        if self.freq_weights.is_some() {
            self.error = Some(RegressionError::BuildError(
                "Frequency weights specified multiple times".to_string(),
            ));
        }
        let ffreqs: Array1<F> = freqs.mapv(|c| F::from(c).unwrap());
        // TODO: consider adding a check for non-negative weights
        self.freq_weights = Some(ffreqs);
        self
    }

    /// Variance weights (a.k.a. analytic weights) of each observation. These could represent the
    /// inverse square of the uncertainties of each measurement.
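    ///
    /// For instance, an observation with measurement uncertainty `σ_i` would
    /// naturally receive the weight `w_i = 1 / σ_i²`.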
    pub fn var_weights(mut self, weights: Array1<F>) -> Self {
        if self.var_weights.is_some() {
            self.error = Some(RegressionError::BuildError(
                "Variance weights specified multiple times".to_string(),
            ));
        }
        // TODO: consider adding a check for non-negative weights
        self.var_weights = Some(weights);
        self
    }

    /// Do not add a constant (intercept) term to the design matrix.
    pub fn no_constant(mut self) -> Self {
        self.use_intercept_term = false;
        self
    }

    /// Set the tolerance for the collinearity check.
    /// The check can be effectively disabled by setting the tolerance to a negative value.
    pub fn colinear_tol(mut self, tol: F) -> Self {
        self.colin_tol = tol;
        self
    }

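    /// Validate the data and configuration and construct the `Model`.
    ///
    /// Checks that the dimensions of the inputs are consistent, prepends a
    /// constant column to the X data unless `no_constant()` was called, and
    /// returns an error if the resulting design matrix is rank-deficient
    /// within the configured tolerance.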
    pub fn build(self) -> RegressionResult<Model<M, F>>
    where
        M: Glm,
        F: Float,
    {
        if let Some(err) = self.error {
            return Err(err);
        }

        let n_data = self.data_y.len();
        if n_data != self.data_x.nrows() {
            return Err(RegressionError::BadInput(
                "y and x data must have same number of points".to_string(),
            ));
        }
        // If they are provided, check that the offsets have the correct number of entries
        if let Some(lin_off) = &self.linear_offset {
            if n_data != lin_off.len() {
                return Err(RegressionError::BadInput(
                    "Offsets must have same dimension as observations".to_string(),
                ));
            }
        }

        // Add the constant term to the X data
        let data_x = if self.use_intercept_term {
            one_pad(self.data_x)
        } else {
            self.data_x.to_owned()
        };
        // Check if the data is under-constrained
        if n_data < data_x.ncols() {
            // The regression can find a solution if n_data == ncols, but there will be
            // no estimate for the uncertainty. Regularization can solve this, so keep
            // it to a warning.
            // return Err(RegressionError::Underconstrained);
            eprintln!("Warning: data is underconstrained");
        }
        // Check for collinearity up to a tolerance
        let xtx: Array2<F> = data_x.t().dot(&data_x);
        if is_rank_deficient(xtx, self.colin_tol)? {
            return Err(RegressionError::ColinearData);
        }

        // Convert the response data to floating-point
        let data_y: Array1<F> = self
            .data_y
            .iter()
            .map(|&y| y.into_float())
            .collect::<Result<_, _>>()?;

        Ok(Model {
            model: PhantomData,
            data: Dataset {
                y: data_y,
                x: data_x,
                linear_offset: self.linear_offset,
                weights: self.var_weights,
                freqs: self.freq_weights,
            },
            use_intercept: self.use_intercept_term,
        })
    }
}