1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
//! Trait defining a generalized linear model for common functionality.
//! Models are fit such that $`<\mathbf{y}> = g^{-1}(\mathbf{X}*\mathbf{\beta})`$ where g is the
//! link function.
use crate::irls::IrlsStep;
use crate::link::{Link, Transform};
use crate::{
data::Dataset,
error::RegressionResult,
fit::{Fit, options::FitOptions},
irls::Irls,
model::Model,
num::Float,
};
use itertools::process_results;
use ndarray::{Array1, Array2, ArrayRef2};
use ndarray_linalg::SolveH;
/// Whether the model's response has a free dispersion parameter (e.g. linear) or if it is fixed to
/// one (e.g. logistic)
// Derives added so callers can copy, compare, and debug-print the dispersion kind;
// the enum is two unit variants, so `Copy` is free.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum DispersionType {
    /// The dispersion parameter is estimated from the data (e.g. a linear/Gaussian model).
    FreeDispersion,
    /// The dispersion parameter is fixed at one by the response family (e.g. a logistic model).
    NoDispersion,
}
/// Trait describing generalized linear model that enables the IRLS algorithm
/// for fitting.
pub trait Glm: Sized {
    /// The link function type of the GLM instantiation. Implementations specify
    /// this manually so that the provided methods can be called in this trait
    /// without necessitating a trait parameter.
    type Link: Link<Self>;

    /// Registers whether the dispersion is fixed at one (e.g. logistic) or free (e.g. linear)
    const DISPERSED: DispersionType;

    /// The link function which maps the expected value of the response variable
    /// to the linear predictor.
    fn link<F: Float>(y: Array1<F>) -> Array1<F> {
        y.mapv(Self::Link::func)
    }

    /// The inverse of the link function which maps the linear predictors to the
    /// expected value of the prediction.
    fn mean<F: Float>(lin_pred: &Array1<F>) -> Array1<F> {
        lin_pred.mapv(Self::Link::func_inv)
    }

    /// The logarithm of the partition function $`A(\eta)`$ in terms of the natural parameter.
    /// This can be used to calculate the normalized likelihood.
    fn log_partition<F: Float>(nat_par: F) -> F;

    /// The variance function $`V(\mu)`$, the second derivative of the log-partition function
    /// with respect to the natural parameter:
    ///
    /// ```math
    /// V(\mu) = \left.\frac{\partial^2 A(\eta)}{\partial \eta^2}\right|_{\eta = \eta(\mu)}
    /// ```
    ///
    /// This is unique to each response family and does not depend on the link function.
    fn variance<F: Float>(mean: F) -> F;

    /// Get the full adjusted variance diagonal from the linear predictors directly.
    fn adjusted_variance_diag<F: Float>(lin_pred: &Array1<F>) -> Array1<F> {
        // The prediction of y given the current model.
        let predictor: Array1<F> = Self::mean(lin_pred);
        // The variances predicted by the model.
        let var_diag: Array1<F> = predictor.mapv(Self::variance);
        // Let the link type apply any adjustment needed (e.g. for non-canonical links).
        Self::Link::adjust_variance(var_diag, lin_pred)
    }

    /// Returns the likelihood function summed over all observations.
    fn log_like<F>(data: &Dataset<F>, regressors: &Array1<F>) -> F
    where
        F: Float,
    {
        // the total likelihood prior to regularization
        let terms = Self::log_like_terms(data, regressors);
        let weighted_terms = data.apply_total_weights(terms);
        weighted_terms.sum()
    }

    /// Per-observation log-likelihood as a function of the response $`y`$ and natural
    /// parameter $`\eta`$:
    ///
    /// ```math
    /// l(y, \eta) = y\eta - A(\eta)
    /// ```
    ///
    /// Terms depending only on $`y`$ are dropped. The dispersion parameter is taken to be 1, as
    /// it does not affect the IRLS steps. The default implementation can be overwritten for
    /// performance or numerical accuracy, but should be mathematically equivalent.
    fn log_like_natural<F>(y: F, nat: F) -> F
    where
        F: Float,
    {
        // NOTE: subtracting the saturated likelihood would keep the likelihood closer to
        // zero, but this can complicate some fit statistics. In addition to
        // causing some null likelihood tests to fail as written, it would make
        // the current deviance calculation incorrect.
        y * nat - Self::log_partition(nat)
    }

    /// Returns the likelihood of a saturated model where every observation can
    /// be fit exactly.
    fn log_like_sat<F>(y: F) -> F
    where
        F: Float;

    /// Returns the log-likelihood contributions for each observation given the regressor values.
    /// It's assumed that these regressors are in the same standardization scale as the dataset.
    fn log_like_terms<F>(data: &Dataset<F>, regressors: &Array1<F>) -> Array1<F>
    where
        F: Float,
    {
        let lin_pred = data.linear_predictor_std(regressors);
        let nat_par = Self::Link::nat_param(lin_pred);
        // the likelihood prior to regularization
        ndarray::Zip::from(&data.y)
            .and(&nat_par)
            .map_collect(|&y, &eta| Self::log_like_natural(y, eta))
    }

    /// Provide an initial guess for the parameters. This can be overridden
    /// but this should provide a decent general starting point. The y data is
    /// averaged with its mean to prevent infinities resulting from application
    /// of the link function:
    /// `X * beta_0 ~ g(0.5*(y + y_avg))`
    /// This is equivalent to minimizing half the sum of squared differences
    /// between X*beta and g(0.5*(y + y_avg)).
    fn init_guess<F>(data: &Dataset<F>) -> Array1<F>
    where
        F: Float,
        ArrayRef2<F>: SolveH<F>,
    {
        let y_bar: F = data.y.mean().unwrap_or_else(F::zero);
        // Averaging each y with the sample mean keeps boundary responses (e.g. exactly 0
        // or 1) away from points where the link function diverges.
        let mu_y: Array1<F> = data.y.mapv(|y| F::half() * (y + y_bar));
        let link_y = mu_y.mapv(Self::Link::func);
        // Compensate for linear offsets if they are present
        let link_y: Array1<F> = if let Some(off) = &data.linear_offset {
            &link_y - off
        } else {
            link_y
        };
        // Solve the normal equations (X^H X) beta = X^H g(mu_y) for the least-squares
        // starting point, falling back to zero if the system cannot be solved.
        // (Returned directly: the former `let init_guess = ...; init_guess` tail
        // tripped clippy's `let_and_return` lint.)
        let x_mat: Array2<F> = data.x_conj().dot(&data.x);
        x_mat
            .solveh_into(data.x_conj().dot(&link_y))
            .unwrap_or_else(|err| {
                eprintln!("WARNING: failed to get initial guess for IRLS. Will begin at zero.");
                eprintln!("{err}");
                Array1::<F>::zeros(data.x.ncols())
            })
    }

    /// Do the regression and return a result. Returns object holding fit result.
    // NB: the former `Self: Sized` bound was removed as redundant; the trait itself
    // already declares `Glm: Sized`.
    fn regression<F>(
        model: &Model<Self, F>,
        options: FitOptions<F>,
    ) -> RegressionResult<Fit<'_, Self, F>, F>
    where
        F: Float,
    {
        let initial: Array1<F> = options
            .init_guess
            .clone()
            // The guess is given in external space, so it should be mapped to the internal before
            // passing to IRLS.
            .map(|b| model.data.transform_beta(b))
            .unwrap_or_else(|| Self::init_guess(&model.data));
        let mut irls: Irls<Self, F> = Irls::new(model, initial, options);
        // Collect the best guess along the way so we don't have to loop back through to make sure
        // we've got the optimum. The full history is stored in the IRLS structure.
        let optimum: IrlsStep<F> = process_results(irls.by_ref(), |iter| {
            iter.max_by(|a, b| a.like.total_cmp(&b.like))
        })?
        .unwrap_or_else(|| irls.make_last_step());
        Ok(Fit::new(&model.data, optimum, irls))
    }
}
#[cfg(test)]
mod tests {
    use crate::{Linear, ModelBuilder};
    use anyhow::Result;
    use ndarray::{Array1, Array2};

    /// A model whose data are all zeros is already at its optimum; check that the
    /// fit nevertheless yields at least one IRLS step and lands exactly on zero.
    #[test]
    fn returns_at_least_one() -> Result<()> {
        // Four observations over two covariates, all zero.
        let y = Array1::<f64>::zeros(4);
        let x = Array2::<f64>::zeros((4, 2));
        let model = ModelBuilder::<Linear>::data(&y, &x).build()?;
        let fit = model.fit_options().l2_reg(1e-6).fit()?;
        // Two covariates plus the intercept give three parameters.
        // These should be exactly equal, not approximately.
        assert_eq!(&fit.result, Array1::<f64>::zeros(3));
        Ok(())
    }
}