// ndarray_glm/model.rs

//! Collect data for and configure a model

use crate::{
    error::{RegressionError, RegressionResult},
    fit::{self, Fit},
    glm::Glm,
    math::is_rank_deficient,
    num::Float,
    response::Response,
    utility::one_pad,
};
use fit::options::{FitConfig, FitOptions};
use ndarray::{Array1, Array2, ArrayBase, ArrayView1, ArrayView2, Data, Ix1, Ix2};
use ndarray_linalg::InverseInto;
use std::{
    cell::{Ref, RefCell},
    marker::PhantomData,
};

/// Holds the observed data for a regression: the response vector, the design
/// matrix, and optional per-observation offsets and weights, along with a
/// lazily cached projection (hat) matrix.
pub struct Dataset<F>
where
    F: Float,
{
    /// the observation of response data by event
    pub y: Array1<F>,
    /// the design matrix with events in rows and instances in columns
    pub x: Array2<F>,
    /// The offset in the linear predictor for each data point. This can be used
    /// to fix the effect of control variables.
    // TODO: Consider making this an option of a reference.
    pub linear_offset: Option<Array1<F>>,
    /// The weight of each observation
    pub weights: Option<Array1<F>>,
    /// The cached projection (hat) matrix, computed on first access.
    // crate-public only so that a null dataset can be created.
    pub(crate) hat: RefCell<Option<Array2<F>>>,
}

39impl<F> Dataset<F>
40where
41    F: Float,
42{
43    /// Returns the linear predictors, i.e. the design matrix multiplied by the
44    /// regression parameters. Each entry in the resulting array is the linear
45    /// predictor for a given observation. If linear offsets for each
46    /// observation are provided, these are added to the linear predictors
47    pub fn linear_predictor(&self, regressors: &Array1<F>) -> Array1<F> {
48        let linear_predictor: Array1<F> = self.x.dot(regressors);
49        // Add linear offsets to the predictors if they are set
50        if let Some(lin_offset) = &self.linear_offset {
51            linear_predictor + lin_offset
52        } else {
53            linear_predictor
54        }
55    }
56
57    /// Returns the hat matrix of the dataset of covariate data, also known as the "projection" or
58    /// "influence" matrix.
59    pub fn hat(&self) -> RegressionResult<Ref<Array2<F>>> {
60        if self.hat.borrow().is_none() {
61            if self.weights.is_some() {
62                unimplemented!("Weights must be accounted for in the hat matrix")
63            }
64            let xt = self.x.t();
65            let xtx: Array2<F> = xt.dot(&self.x);
66            // NOTE: invh/invh_into() are bugged and incorrect!
67            let xtx_inv = xtx.inv_into().map_err(|_| RegressionError::ColinearData)?;
68            *self.hat.borrow_mut() = Some(self.x.dot(&xtx_inv).dot(&xt));
69        }
70        let borrowed: Ref<Option<Array2<F>>> = self.hat.borrow();
71        Ok(Ref::map(borrowed, |x| x.as_ref().unwrap()))
72    }
73
74    /// Returns the leverage for each observation. This is given by the diagonal of the projection
75    /// matrix and indicates the sensitivity of each prediction to its corresponding observation.
76    pub fn leverage(&self) -> RegressionResult<Array1<F>> {
77        let hat = self.hat()?;
78        Ok(hat.diag().to_owned())
79    }
80}
81
/// Holds the data and configuration settings for a regression.
pub struct Model<M, F>
where
    M: Glm,
    F: Float,
{
    // Zero-sized marker tying the model to a particular GLM family M.
    pub(crate) model: PhantomData<M>,
    /// The dataset
    pub data: Dataset<F>,
    /// Whether the intercept term is used (commonly true)
    pub use_intercept: bool,
}

95impl<M, F> Model<M, F>
96where
97    M: Glm,
98    F: Float,
99{
100    /// Perform the regression and return a fit object holding the results.
101    pub fn fit(&self) -> RegressionResult<Fit<M, F>> {
102        self.fit_options().fit()
103    }
104
105    /// Fit options builder interface
106    pub fn fit_options(&self) -> FitConfig<M, F> {
107        FitConfig {
108            model: self,
109            options: FitOptions::default(),
110        }
111    }
112
113    /// An experimental interface that would allow fit options to be set externally.
114    pub fn with_options(&self, options: FitOptions<F>) -> FitConfig<M, F> {
115        FitConfig {
116            model: self,
117            options,
118        }
119    }
120}
121
/// Provides an interface to create the full model option struct with convenient
/// type inference.
pub struct ModelBuilder<M: Glm> {
    // Zero-sized marker tying the builder to a particular GLM family M.
    _model: PhantomData<M>,
}

128impl<M: Glm> ModelBuilder<M> {
129    /// Borrow the Y and X data where each row in the arrays is a new
130    /// observation, and create the full model builder with the data to allow
131    /// for adjusting additional options.
132    pub fn data<'a, Y, F, YD, XD>(
133        data_y: &'a ArrayBase<YD, Ix1>,
134        data_x: &'a ArrayBase<XD, Ix2>,
135    ) -> ModelBuilderData<'a, M, Y, F>
136    where
137        Y: Response<M>,
138        F: Float,
139        YD: Data<Elem = Y>,
140        XD: Data<Elem = F>,
141    {
142        ModelBuilderData {
143            model: PhantomData,
144            data_y: data_y.view(),
145            data_x: data_x.view(),
146            linear_offset: None,
147            weights: None,
148            use_intercept_term: true,
149            colin_tol: F::epsilon(),
150        }
151    }
152}
153
/// Holds the data and all the specifications for the model and provides
/// functions to adjust the settings.
pub struct ModelBuilderData<'a, M, Y, F>
where
    M: Glm,
    Y: Response<M>,
    F: 'static + Float,
{
    // Zero-sized marker for the GLM family the model will be built for.
    model: PhantomData<M>,
    /// Observed response variable data where each entry is a new observation.
    data_y: ArrayView1<'a, Y>,
    /// Design matrix of observed covariate data where each row is a new
    /// observation and each column represents a different dependent variable.
    data_x: ArrayView2<'a, F>,
    /// The offset in the linear predictor for each data point. This can be used
    /// to incorporate control terms.
    // TODO: consider making this a reference/ArrayView. Y and X are effectively
    // cloned so perhaps this isn't a big deal.
    linear_offset: Option<Array1<F>>,
    /// The weights for each observation.
    weights: Option<Array1<F>>,
    /// Whether to use an intercept term. Defaults to `true`.
    use_intercept_term: bool,
    /// tolerance for determinant check on rank of data matrix X.
    colin_tol: F,
}

181/// A builder to generate a Model object
182impl<'a, M, Y, F> ModelBuilderData<'a, M, Y, F>
183where
184    M: Glm,
185    Y: Response<M> + Copy,
186    F: Float,
187{
188    /// Represents an offset added to the linear predictor for each data point.
189    /// This can be used to control for fixed effects or in multi-level models.
190    pub fn linear_offset(mut self, linear_offset: Array1<F>) -> Self {
191        self.linear_offset = Some(linear_offset);
192        self
193    }
194
195    /// Do not add a constant term to the design matrix
196    pub fn no_constant(mut self) -> Self {
197        self.use_intercept_term = false;
198        self
199    }
200
201    /// Set the tolerance for the co-linearity check.
202    /// The check can be effectively disabled by setting the tolerance to a negative value.
203    pub fn colinear_tol(mut self, tol: F) -> Self {
204        self.colin_tol = tol;
205        self
206    }
207
208    pub fn build(self) -> RegressionResult<Model<M, F>>
209    where
210        M: Glm,
211        F: Float,
212    {
213        let n_data = self.data_y.len();
214        if n_data != self.data_x.nrows() {
215            return Err(RegressionError::BadInput(
216                "y and x data must have same number of points".to_string(),
217            ));
218        }
219        // If they are provided, check that the offsets have the correct number of entries
220        if let Some(lin_off) = &self.linear_offset {
221            if n_data != lin_off.len() {
222                return Err(RegressionError::BadInput(
223                    "Offsets must have same dimension as observations".to_string(),
224                ));
225            }
226        }
227
228        // add constant term to X data
229        let data_x = if self.use_intercept_term {
230            one_pad(self.data_x)
231        } else {
232            self.data_x.to_owned()
233        };
234        // Check if the data is under-constrained
235        if n_data < data_x.ncols() {
236            // The regression can find a solution if n_data == ncols, but there will be
237            // no estimate for the uncertainty. Regularization can solve this, so keep
238            // it to a warning.
239            // return Err(RegressionError::Underconstrained);
240            eprintln!("Warning: data is underconstrained");
241        }
242        // Check for co-linearity up to a tolerance
243        let xtx: Array2<F> = data_x.t().dot(&data_x);
244        if is_rank_deficient(xtx, self.colin_tol)? {
245            return Err(RegressionError::ColinearData);
246        }
247
248        // convert to floating-point
249        let data_y: Array1<F> = self
250            .data_y
251            .iter()
252            .map(|&y| y.into_float())
253            .collect::<Result<_, _>>()?;
254
255        Ok(Model {
256            model: PhantomData,
257            data: Dataset {
258                y: data_y,
259                x: data_x,
260                linear_offset: self.linear_offset,
261                weights: self.weights,
262                hat: RefCell::new(None),
263            },
264            use_intercept: self.use_intercept_term,
265        })
266    }
267}