// ndarray_glm/model.rs
//! Collect data for and configure a model
2
3use crate::{
4 error::{RegressionError, RegressionResult},
5 fit::{self, Fit},
6 glm::Glm,
7 math::is_rank_deficient,
8 num::Float,
9 response::Response,
10 utility::one_pad,
11};
12use fit::options::{FitConfig, FitOptions};
13use ndarray::{Array1, Array2, ArrayBase, ArrayView1, ArrayView2, Data, Ix1, Ix2};
14use ndarray_linalg::InverseInto;
15use std::{
16 cell::{Ref, RefCell},
17 marker::PhantomData,
18};
19
/// The data for a regression: response observations, covariate data, and
/// optional per-observation adjustments (linear offsets and weights).
pub struct Dataset<F>
where
    F: Float,
{
    /// the observation of response data by event
    pub y: Array1<F>,
    /// the design matrix with events in rows and instances in columns
    pub x: Array2<F>,
    /// The offset in the linear predictor for each data point. This can be used
    /// to fix the effect of control variables.
    // TODO: Consider making this an option of a reference.
    pub linear_offset: Option<Array1<F>>,
    /// The weight of each observation
    pub weights: Option<Array1<F>>,
    /// The cached projection (hat) matrix, computed lazily by `hat()`.
    // crate-public only so that a null dataset can be created.
    pub(crate) hat: RefCell<Option<Array2<F>>>,
}
38
39impl<F> Dataset<F>
40where
41 F: Float,
42{
43 /// Returns the linear predictors, i.e. the design matrix multiplied by the
44 /// regression parameters. Each entry in the resulting array is the linear
45 /// predictor for a given observation. If linear offsets for each
46 /// observation are provided, these are added to the linear predictors
47 pub fn linear_predictor(&self, regressors: &Array1<F>) -> Array1<F> {
48 let linear_predictor: Array1<F> = self.x.dot(regressors);
49 // Add linear offsets to the predictors if they are set
50 if let Some(lin_offset) = &self.linear_offset {
51 linear_predictor + lin_offset
52 } else {
53 linear_predictor
54 }
55 }
56
57 /// Returns the hat matrix of the dataset of covariate data, also known as the "projection" or
58 /// "influence" matrix.
59 pub fn hat(&self) -> RegressionResult<Ref<Array2<F>>> {
60 if self.hat.borrow().is_none() {
61 if self.weights.is_some() {
62 unimplemented!("Weights must be accounted for in the hat matrix")
63 }
64 let xt = self.x.t();
65 let xtx: Array2<F> = xt.dot(&self.x);
66 // NOTE: invh/invh_into() are bugged and incorrect!
67 let xtx_inv = xtx.inv_into().map_err(|_| RegressionError::ColinearData)?;
68 *self.hat.borrow_mut() = Some(self.x.dot(&xtx_inv).dot(&xt));
69 }
70 let borrowed: Ref<Option<Array2<F>>> = self.hat.borrow();
71 Ok(Ref::map(borrowed, |x| x.as_ref().unwrap()))
72 }
73
74 /// Returns the leverage for each observation. This is given by the diagonal of the projection
75 /// matrix and indicates the sensitivity of each prediction to its corresponding observation.
76 pub fn leverage(&self) -> RegressionResult<Array1<F>> {
77 let hat = self.hat()?;
78 Ok(hat.diag().to_owned())
79 }
80}
81
/// Holds the data and configuration settings for a regression.
pub struct Model<M, F>
where
    M: Glm,
    F: Float,
{
    // Zero-sized marker selecting the GLM family at the type level.
    pub(crate) model: PhantomData<M>,
    /// The dataset
    pub data: Dataset<F>,
    /// Whether the intercept term is used (commonly true)
    pub use_intercept: bool,
}
94
95impl<M, F> Model<M, F>
96where
97 M: Glm,
98 F: Float,
99{
100 /// Perform the regression and return a fit object holding the results.
101 pub fn fit(&self) -> RegressionResult<Fit<M, F>> {
102 self.fit_options().fit()
103 }
104
105 /// Fit options builder interface
106 pub fn fit_options(&self) -> FitConfig<M, F> {
107 FitConfig {
108 model: self,
109 options: FitOptions::default(),
110 }
111 }
112
113 /// An experimental interface that would allow fit options to be set externally.
114 pub fn with_options(&self, options: FitOptions<F>) -> FitConfig<M, F> {
115 FitConfig {
116 model: self,
117 options,
118 }
119 }
120}
121
/// Provides an interface to create the full model option struct with convenient
/// type inference.
pub struct ModelBuilder<M: Glm> {
    // Zero-sized marker tying the builder to a specific GLM family.
    _model: PhantomData<M>,
}
127
128impl<M: Glm> ModelBuilder<M> {
129 /// Borrow the Y and X data where each row in the arrays is a new
130 /// observation, and create the full model builder with the data to allow
131 /// for adjusting additional options.
132 pub fn data<'a, Y, F, YD, XD>(
133 data_y: &'a ArrayBase<YD, Ix1>,
134 data_x: &'a ArrayBase<XD, Ix2>,
135 ) -> ModelBuilderData<'a, M, Y, F>
136 where
137 Y: Response<M>,
138 F: Float,
139 YD: Data<Elem = Y>,
140 XD: Data<Elem = F>,
141 {
142 ModelBuilderData {
143 model: PhantomData,
144 data_y: data_y.view(),
145 data_x: data_x.view(),
146 linear_offset: None,
147 weights: None,
148 use_intercept_term: true,
149 colin_tol: F::epsilon(),
150 }
151 }
152}
153
/// Holds the data and all the specifications for the model and provides
/// functions to adjust the settings.
pub struct ModelBuilderData<'a, M, Y, F>
where
    M: Glm,
    Y: Response<M>,
    F: 'static + Float,
{
    // Zero-sized marker for the GLM family being built.
    model: PhantomData<M>,
    /// Observed response variable data where each entry is a new observation.
    data_y: ArrayView1<'a, Y>,
    /// Design matrix of observed covariate data where each row is a new
    /// observation and each column represents a different dependent variable.
    data_x: ArrayView2<'a, F>,
    /// The offset in the linear predictor for each data point. This can be used
    /// to incorporate control terms.
    // TODO: consider making this a reference/ArrayView. Y and X are effectively
    // cloned so perhaps this isn't a big deal.
    linear_offset: Option<Array1<F>>,
    /// The weights for each observation.
    weights: Option<Array1<F>>,
    /// Whether to use an intercept term. Defaults to `true`.
    use_intercept_term: bool,
    /// tolerance for determinant check on rank of data matrix X.
    colin_tol: F,
}
180
181/// A builder to generate a Model object
182impl<'a, M, Y, F> ModelBuilderData<'a, M, Y, F>
183where
184 M: Glm,
185 Y: Response<M> + Copy,
186 F: Float,
187{
188 /// Represents an offset added to the linear predictor for each data point.
189 /// This can be used to control for fixed effects or in multi-level models.
190 pub fn linear_offset(mut self, linear_offset: Array1<F>) -> Self {
191 self.linear_offset = Some(linear_offset);
192 self
193 }
194
195 /// Do not add a constant term to the design matrix
196 pub fn no_constant(mut self) -> Self {
197 self.use_intercept_term = false;
198 self
199 }
200
201 /// Set the tolerance for the co-linearity check.
202 /// The check can be effectively disabled by setting the tolerance to a negative value.
203 pub fn colinear_tol(mut self, tol: F) -> Self {
204 self.colin_tol = tol;
205 self
206 }
207
208 pub fn build(self) -> RegressionResult<Model<M, F>>
209 where
210 M: Glm,
211 F: Float,
212 {
213 let n_data = self.data_y.len();
214 if n_data != self.data_x.nrows() {
215 return Err(RegressionError::BadInput(
216 "y and x data must have same number of points".to_string(),
217 ));
218 }
219 // If they are provided, check that the offsets have the correct number of entries
220 if let Some(lin_off) = &self.linear_offset {
221 if n_data != lin_off.len() {
222 return Err(RegressionError::BadInput(
223 "Offsets must have same dimension as observations".to_string(),
224 ));
225 }
226 }
227
228 // add constant term to X data
229 let data_x = if self.use_intercept_term {
230 one_pad(self.data_x)
231 } else {
232 self.data_x.to_owned()
233 };
234 // Check if the data is under-constrained
235 if n_data < data_x.ncols() {
236 // The regression can find a solution if n_data == ncols, but there will be
237 // no estimate for the uncertainty. Regularization can solve this, so keep
238 // it to a warning.
239 // return Err(RegressionError::Underconstrained);
240 eprintln!("Warning: data is underconstrained");
241 }
242 // Check for co-linearity up to a tolerance
243 let xtx: Array2<F> = data_x.t().dot(&data_x);
244 if is_rank_deficient(xtx, self.colin_tol)? {
245 return Err(RegressionError::ColinearData);
246 }
247
248 // convert to floating-point
249 let data_y: Array1<F> = self
250 .data_y
251 .iter()
252 .map(|&y| y.into_float())
253 .collect::<Result<_, _>>()?;
254
255 Ok(Model {
256 model: PhantomData,
257 data: Dataset {
258 y: data_y,
259 x: data_x,
260 linear_offset: self.linear_offset,
261 weights: self.weights,
262 hat: RefCell::new(None),
263 },
264 use_intercept: self.use_intercept_term,
265 })
266 }
267}