use std::borrow::Cow;
use std::collections::{BTreeSet, HashMap, HashSet};
use std::iter;
use std::ops::Neg;
use nalgebra::{DMatrix, DVector};
pub use crate::error::{Error, InconsistentSlopes};
use crate::stats::students_t_cdf;
mod error;
mod stats;
#[cfg(test)]
mod tests;
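// Early-returns the given error from the enclosing function when the
// predicate is false.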
macro_rules! ensure {
($predicate:expr, $error:expr) => {
if !$predicate {
return Err($error);
}
};
}
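/// Asserts that two `f64` values are equal to within an absolute tolerance
/// (`1.0E-14` unless a third argument overrides it). A sketch (the crate name
/// `linregress` is assumed here):
///
/// ```
/// # use linregress::assert_almost_eq;
/// assert_almost_eq!(0.1 + 0.2, 0.3, 1.0E-12);
/// ```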
#[macro_export]
macro_rules! assert_almost_eq {
($a:expr, $b:expr) => {
$crate::assert_almost_eq!($a, $b, 1.0E-14);
};
($a:expr, $b:expr, $prec:expr) => {
if !$crate::almost_equal($a, $b, $prec) {
panic!("assert_almost_eq failed:\n{:?} vs\n{:?}", $a, $b);
}
};
}
#[macro_export]
macro_rules! assert_slices_almost_eq {
($a:expr, $b:expr) => {
$crate::assert_slices_almost_eq!($a, $b, 1.0E-14);
};
($a:expr, $b:expr, $prec:expr) => {
if !$crate::slices_almost_equal($a, $b, $prec) {
panic!("assert_slices_almost_eq failed:\n{:?} vs\n{:?}", $a, $b);
}
};
}
#[doc(hidden)]
pub fn almost_equal(a: f64, b: f64, precision: f64) -> bool {
if a.is_infinite() || b.is_infinite() || a.is_nan() || b.is_nan() {
false
} else {
(a - b).abs() <= precision
}
}
#[doc(hidden)]
pub fn slices_almost_equal(a: &[f64], b: &[f64], precision: f64) -> bool {
if a.len() != b.len() {
return false;
}
for (&x, &y) in a.iter().zip(b.iter()) {
if !almost_equal(x, y, precision) {
return false;
}
}
true
}
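// Two-stage comparison: an absolute-epsilon check (covers values near zero)
// followed by a units-in-the-last-place (ULPs) comparison of the raw bit
// patterns, which is only meaningful for operands of the same sign.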
fn ulps_eq(a: f64, b: f64, epsilon: f64, max_ulps: u32) -> bool {
if (a - b).abs() <= epsilon {
return true;
}
if a.signum() != b.signum() {
return false;
}
let a: u64 = a.to_bits();
let b: u64 = b.to_bits();
a.abs_diff(b) <= max_ulps as u64
}
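/// Builder for an ordinary least squares regression on named columns of a
/// [`RegressionData`]. Specify the model either with [`formula`](Self::formula)
/// or with [`data_columns`](Self::data_columns), but not both.
///
/// A usage sketch (the crate name `linregress` is assumed here):
///
/// ```
/// use linregress::{FormulaRegressionBuilder, RegressionDataBuilder};
///
/// # fn main() -> Result<(), linregress::Error> {
/// let y = vec![1., 2., 3., 4., 5.];
/// let x1 = vec![5., 4., 3., 2., 1.];
/// let x2 = vec![729.53, 439.0181, 42.54, 1., 0.];
/// let data = RegressionDataBuilder::new()
///     .build_from(vec![("Y", y), ("X1", x1), ("X2", x2)])?;
/// let model = FormulaRegressionBuilder::new()
///     .data(&data)
///     .formula("Y ~ X1 + X2")
///     .fit()?;
/// println!("{:?}", model.parameters());
/// # Ok(())
/// # }
/// ```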
#[derive(Debug, Clone)]
pub struct FormulaRegressionBuilder<'a> {
data: Option<&'a RegressionData<'a>>,
formula: Option<Cow<'a, str>>,
columns: Option<(Cow<'a, str>, Vec<Cow<'a, str>>)>,
}
impl<'a> Default for FormulaRegressionBuilder<'a> {
fn default() -> Self {
FormulaRegressionBuilder::new()
}
}
impl<'a> FormulaRegressionBuilder<'a> {
pub fn new() -> Self {
FormulaRegressionBuilder {
data: None,
formula: None,
columns: None,
}
}
pub fn data(mut self, data: &'a RegressionData<'a>) -> Self {
self.data = Some(data);
self
}
pub fn formula<T: Into<Cow<'a, str>>>(mut self, formula: T) -> Self {
self.formula = Some(formula.into());
self
}
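    /// Names the regressand and regressor columns directly instead of using a
    /// formula string; handy when the column names are only known at runtime.
    /// A sketch:
    ///
    /// ```
    /// # use linregress::{FormulaRegressionBuilder, RegressionDataBuilder};
    /// # fn main() -> Result<(), linregress::Error> {
    /// # let data = RegressionDataBuilder::new().build_from(vec![
    /// #     ("Y", vec![1., 2., 3., 5.]),
    /// #     ("X1", vec![4., 5., 7., 8.]),
    /// #     ("X2", vec![1., 3., 2., 5.]),
    /// # ])?;
    /// let model = FormulaRegressionBuilder::new()
    ///     .data(&data)
    ///     .data_columns("Y", ["X1", "X2"])
    ///     .fit()?;
    /// # Ok(())
    /// # }
    /// ```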
pub fn data_columns<I, S1, S2>(mut self, regressand: S1, regressors: I) -> Self
where
I: IntoIterator<Item = S2>,
S1: Into<Cow<'a, str>>,
S2: Into<Cow<'a, str>>,
{
let regressand = regressand.into();
let regressors: Vec<_> = regressors.into_iter().map(|i| i.into()).collect();
self.columns = Some((regressand, regressors));
self
}
pub fn fit(self) -> Result<RegressionModel, Error> {
        let FittingData(input_vector, output_matrix, outputs) =
            self.get_matrices_and_regressor_names()?;
RegressionModel::try_from_matrices_and_regressor_names(input_vector, output_matrix, outputs)
}
pub fn fit_without_statistics(self) -> Result<Vec<f64>, Error> {
        let FittingData(input_vector, output_matrix, _output_names) =
            self.get_matrices_and_regressor_names()?;
let low_level_result = fit_ols_pinv(input_vector, output_matrix)?;
let parameters = low_level_result.params;
Ok(parameters.iter().copied().collect())
}
fn get_matrices_and_regressor_names(self) -> Result<FittingData, Error> {
let (input, outputs) = self.get_data_columns()?;
let data = &self.data.ok_or(Error::NoData)?.data;
let input_vector: Vec<f64> = data
.get(&input)
.cloned()
.ok_or_else(|| Error::ColumnNotInData(input.into()))?;
        // Start the design matrix with a column of ones so the fitted model
        // has an intercept term.
        let mut output_matrix = vec![1.; input_vector.len()];
for output in &outputs {
let output_vec = data
.get(output.as_ref())
.ok_or_else(|| Error::ColumnNotInData(output.to_string()))?;
ensure!(
output_vec.len() == input_vector.len(),
Error::RegressorRegressandDimensionMismatch(output.to_string())
);
output_matrix.extend_from_slice(output_vec);
}
let output_matrix = DMatrix::from_vec(input_vector.len(), outputs.len() + 1, output_matrix);
let outputs: Vec<_> = outputs.iter().map(|x| x.to_string()).collect();
Ok(FittingData(input_vector, output_matrix, outputs))
}
fn get_data_columns(&self) -> Result<(Cow<'_, str>, Vec<Cow<'_, str>>), Error> {
match (self.formula.as_ref(), self.columns.as_ref()) {
(Some(..), Some(..)) => Err(Error::BothFormulaAndDataColumnsGiven),
(Some(formula), None) => Self::parse_formula(formula),
(None, Some((regressand, regressors))) => {
ensure!(!regressors.is_empty(), Error::InvalidDataColumns);
Ok((regressand.clone(), regressors.clone()))
}
(None, None) => Err(Error::NoFormula),
}
}
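    /// Splits a formula of the form `Y ~ X1 + X2` into the regressand (`Y`)
    /// and the regressors (`X1`, `X2`), trimming whitespace and ignoring
    /// empty regressor terms.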
fn parse_formula(formula: &str) -> Result<(Cow<'_, str>, Vec<Cow<'_, str>>), Error> {
let (input, outputs) = formula.split_once('~').ok_or(Error::InvalidFormula)?;
let input = input.trim();
let outputs: Vec<_> = outputs
.split('+')
.map(str::trim)
.filter(|x| !x.is_empty())
.map(|i| i.into())
.collect();
ensure!(!outputs.is_empty(), Error::InvalidFormula);
Ok((input.into(), outputs))
}
}
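/// Intermediate fitting data: the regressand values, the design matrix
/// (including the leading intercept column of ones), and the regressor names.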
struct FittingData(Vec<f64>, DMatrix<f64>, Vec<String>);
#[derive(Debug, Clone)]
pub struct RegressionData<'a> {
data: HashMap<Cow<'a, str>, Vec<f64>>,
}
impl<'a> RegressionData<'a> {
fn new<I, S>(
data: I,
invalid_value_handling: InvalidValueHandling,
) -> Result<RegressionData<'a>, Error>
where
I: IntoIterator<Item = (S, Vec<f64>)>,
S: Into<Cow<'a, str>>,
{
let temp: HashMap<_, _> = data
.into_iter()
.map(|(key, value)| (key.into(), value))
.collect();
ensure!(
!temp.is_empty(),
Error::RegressionDataError("The data contains no columns.".into())
);
let mut len: Option<usize> = None;
for (key, val) in temp.iter() {
let this_len = val.len();
if len.is_none() {
len = Some(this_len);
}
ensure!(
this_len > 0,
Error::RegressionDataError("The data contains an empty column.".into())
);
ensure!(
Some(this_len) == len,
Error::RegressionDataError(
"The lengths of the columns in the given data are inconsistent.".into()
)
);
ensure!(
!key.contains('~') && !key.contains('+'),
Error::RegressionDataError(
"The column names may not contain `~` or `+`, because they are used \
as separators in the formula."
.into()
)
);
}
if Self::check_if_all_columns_are_equal(&temp) {
return Err(Error::RegressionDataError(
"All input columns contain only equal values. Fitting this model would lead \
to invalid statistics."
.into(),
));
}
if Self::check_if_data_is_valid(&temp) {
return Ok(Self { data: temp });
}
match invalid_value_handling {
            InvalidValueHandling::ReturnError => Err(Error::RegressionDataError(
                "The data contains a non-finite value (NaN, infinity, or negative infinity). \
                 If you would like to silently drop these values, configure the builder with \
                 InvalidValueHandling::DropInvalid."
                    .into(),
            )),
InvalidValueHandling::DropInvalid => {
let temp = Self::drop_invalid_values(temp);
let first_key = temp.keys().next().expect("Cleaned data has no columns.");
let first_len = temp[first_key].len();
ensure!(
first_len > 0,
Error::RegressionDataError("The cleaned data is empty.".into())
);
Ok(Self { data: temp })
}
}
}
fn check_if_all_columns_are_equal(data: &HashMap<Cow<'a, str>, Vec<f64>>) -> bool {
for column in data.values() {
if column.is_empty() {
return false;
}
            let first = column[0];
            if !column.iter().all(|&x| x == first) {
                return false;
            }
}
true
}
fn check_if_data_is_valid(data: &HashMap<Cow<'a, str>, Vec<f64>>) -> bool {
for column in data.values() {
if column.iter().any(|x| !x.is_finite()) {
return false;
}
}
true
}
fn drop_invalid_values(
data: HashMap<Cow<'a, str>, Vec<f64>>,
) -> HashMap<Cow<'a, str>, Vec<f64>> {
let mut invalid_rows: BTreeSet<usize> = BTreeSet::new();
for column in data.values() {
for (index, value) in column.iter().enumerate() {
if !value.is_finite() {
invalid_rows.insert(index);
}
}
}
let mut cleaned_data = HashMap::new();
for (key, mut column) in data {
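            // Remove from the back so earlier indices stay valid.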
for index in invalid_rows.iter().rev() {
column.remove(*index);
}
cleaned_data.insert(key, column);
}
cleaned_data
}
}
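/// Builder for [`RegressionData`]. By default, non-finite values in the data
/// cause an error; a sketch of dropping the offending rows instead (crate name
/// `linregress` assumed):
///
/// ```
/// use linregress::{InvalidValueHandling, RegressionDataBuilder};
///
/// # fn main() -> Result<(), linregress::Error> {
/// let data = vec![("Y", vec![1., 2., f64::NAN]), ("X", vec![3., 4., 5.])];
/// let data = RegressionDataBuilder::new()
///     .invalid_value_handling(InvalidValueHandling::DropInvalid)
///     .build_from(data)?;
/// # Ok(())
/// # }
/// ```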
#[derive(Debug, Clone, Copy, Default)]
pub struct RegressionDataBuilder {
handle_invalid_values: InvalidValueHandling,
}
impl RegressionDataBuilder {
pub fn new() -> Self {
Self::default()
}
pub fn invalid_value_handling(mut self, setting: InvalidValueHandling) -> Self {
self.handle_invalid_values = setting;
self
}
pub fn build_from<'a, I, S>(self, data: I) -> Result<RegressionData<'a>, Error>
where
I: IntoIterator<Item = (S, Vec<f64>)>,
S: Into<Cow<'a, str>>,
{
RegressionData::new(data, self.handle_invalid_values)
}
}
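/// How non-finite values (NaN, infinity, or negative infinity) in the input
/// data are handled when building a [`RegressionData`]: `ReturnError` (the
/// default) rejects the data, while `DropInvalid` silently removes every row
/// that contains such a value.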
#[derive(Debug, Clone, Copy, Default)]
#[non_exhaustive]
pub enum InvalidValueHandling {
#[default]
ReturnError,
DropInvalid,
}
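/// A fitted OLS model together with the names of its regressors. The
/// [`parameters`](Self::parameters), [`se`](Self::se), and
/// [`p_values`](Self::p_values) slices start with the intercept, followed by
/// the slopes in the order of [`regressor_names`](Self::regressor_names); the
/// `iter_*_pairs` methods skip the intercept and pair each slope with its
/// regressor name. A sketch:
///
/// ```
/// # use linregress::{FormulaRegressionBuilder, RegressionDataBuilder};
/// # fn main() -> Result<(), linregress::Error> {
/// # let data = RegressionDataBuilder::new()
/// #     .build_from(vec![("Y", vec![1., 2., 3., 5.]), ("X", vec![4., 5., 7., 8.])])?;
/// # let model = FormulaRegressionBuilder::new().data(&data).formula("Y ~ X").fit()?;
/// let intercept = model.parameters()[0];
/// for (name, slope) in model.iter_parameter_pairs() {
///     println!("{name}: {slope}");
/// }
/// # Ok(())
/// # }
/// ```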
#[derive(Debug, Clone)]
pub struct RegressionModel {
regressor_names: Vec<String>,
model: LowLevelRegressionModel,
}
impl RegressionModel {
#[inline]
pub fn regressor_names(&self) -> &[String] {
&self.regressor_names
}
#[inline]
pub fn p_values(&self) -> &[f64] {
self.model.p_values()
}
#[inline]
pub fn iter_p_value_pairs(&self) -> impl Iterator<Item = (&str, f64)> + '_ {
self.regressor_names
.iter()
.zip(self.model.p_values().iter().skip(1))
.map(|(r, &v)| (r.as_str(), v))
}
#[inline]
pub fn residuals(&self) -> &[f64] {
self.model.residuals()
}
#[inline]
pub fn parameters(&self) -> &[f64] {
self.model.parameters()
}
#[inline]
pub fn iter_parameter_pairs(&self) -> impl Iterator<Item = (&str, f64)> + '_ {
self.regressor_names
.iter()
.zip(self.model.parameters().iter().skip(1))
.map(|(r, &v)| (r.as_str(), v))
}
#[inline]
pub fn se(&self) -> &[f64] {
self.model.se()
}
#[inline]
pub fn iter_se_pairs(&self) -> impl Iterator<Item = (&str, f64)> + '_ {
self.regressor_names
.iter()
.zip(self.model.se().iter().skip(1))
.map(|(r, &v)| (r.as_str(), v))
}
#[inline]
pub fn ssr(&self) -> f64 {
self.model.ssr()
}
#[inline]
pub fn rsquared(&self) -> f64 {
self.model.rsquared()
}
#[inline]
pub fn rsquared_adj(&self) -> f64 {
self.model.rsquared_adj()
}
#[inline]
pub fn scale(&self) -> f64 {
self.model.scale()
}
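    /// Evaluates the fitted model on new data. `new_data` must contain exactly
    /// the model's regressor columns, all of equal, nonzero length. A sketch:
    ///
    /// ```
    /// # use linregress::{FormulaRegressionBuilder, RegressionDataBuilder};
    /// # fn main() -> Result<(), linregress::Error> {
    /// # let data = RegressionDataBuilder::new()
    /// #     .build_from(vec![("Y", vec![1., 2., 3., 5.]), ("X", vec![4., 5., 7., 8.])])?;
    /// # let model = FormulaRegressionBuilder::new().data(&data).formula("Y ~ X").fit()?;
    /// let predictions = model.predict(vec![("X", vec![2.5, 3.5])])?;
    /// assert_eq!(predictions.len(), 2);
    /// # Ok(())
    /// # }
    /// ```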
pub fn predict<'a, I, S>(&self, new_data: I) -> Result<Vec<f64>, Error>
where
I: IntoIterator<Item = (S, Vec<f64>)>,
S: Into<Cow<'a, str>>,
{
let new_data: HashMap<Cow<'_, _>, Vec<f64>> = new_data
.into_iter()
.map(|(key, value)| (key.into(), value))
.collect();
self.check_variables(&new_data)?;
let input_len = new_data.values().next().unwrap().len();
let mut new_data_values: Vec<f64> = vec![];
for key in &self.regressor_names {
new_data_values.extend_from_slice(new_data[&Cow::from(key)].as_slice());
}
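        // `parameters` starts with the intercept, so the slope count is len() - 1.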
let num_regressors = self.model.parameters.len() - 1;
let new_data_matrix = DMatrix::from_vec(input_len, num_regressors, new_data_values);
let param_matrix = DMatrix::from_iterator(
num_regressors,
1,
self.model.parameters.iter().skip(1).copied(),
);
let intercept = self.model.parameters[0];
        let intercept_matrix =
            DMatrix::from_iterator(input_len, 1, iter::repeat(intercept).take(input_len));
let predictions = (new_data_matrix * param_matrix) + intercept_matrix;
        let predictions: Vec<f64> = predictions.iter().copied().collect();
Ok(predictions)
}
fn check_variables(
&self,
given_parameters: &HashMap<Cow<'_, str>, Vec<f64>>,
) -> Result<(), Error> {
ensure!(!given_parameters.is_empty(), Error::NoData);
let first_len = given_parameters.values().next().unwrap().len();
ensure!(first_len > 0, Error::NoData);
let model_parameters: HashSet<_> = self.regressor_names.iter().map(Cow::from).collect();
for param in &model_parameters {
if !given_parameters.contains_key(param) {
return Err(Error::ColumnNotInData(param.to_string()));
}
}
for (param, values) in given_parameters {
ensure!(values.len() == first_len, Error::InconsistentVectors);
if !model_parameters.contains(param) {
return Err(Error::ModelColumnNotInData(param.to_string()));
}
}
Ok(())
}
fn try_from_matrices_and_regressor_names<I: IntoIterator<Item = String>>(
inputs: Vec<f64>,
outputs: DMatrix<f64>,
output_names: I,
) -> Result<Self, Error> {
let low_level_result = fit_ols_pinv(inputs, outputs)?;
let model = LowLevelRegressionModel::from_low_level_regression(low_level_result)?;
let regressor_names: Vec<String> = output_names.into_iter().collect();
let num_slopes = model.parameters.len() - 1;
ensure!(
regressor_names.len() == num_slopes,
Error::InconsistentSlopes(InconsistentSlopes::new(regressor_names.len(), num_slopes))
);
Ok(Self {
regressor_names,
model,
})
}
}
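/// A fitted model without attached regressor names, as produced by
/// [`fit_low_level_regression_model`].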
#[derive(Debug, Clone)]
pub struct LowLevelRegressionModel {
parameters: Vec<f64>,
se: Vec<f64>,
ssr: f64,
rsquared: f64,
rsquared_adj: f64,
pvalues: Vec<f64>,
residuals: Vec<f64>,
scale: f64,
}
impl LowLevelRegressionModel {
fn from_low_level_regression(
low_level_result: InternalLowLevelRegressionResult,
) -> Result<Self, Error> {
let parameters = low_level_result.params;
let singular_values = low_level_result.singular_values;
let normalized_cov_params = low_level_result.normalized_cov_params;
        let diag = DMatrix::from_diagonal(&singular_values);
        // The number of nonzero singular values is the rank of the design matrix.
        let rank = diag.rank(0.0);
let input_vec = low_level_result.inputs.to_vec();
let input_matrix = DMatrix::from_vec(low_level_result.inputs.len(), 1, input_vec);
let residuals = &input_matrix - (low_level_result.outputs * parameters.to_owned());
let ssr = residuals.dot(&residuals);
let n = low_level_result.inputs.len();
let df_resid = n - rank;
        ensure!(
            df_resid >= 1,
            Error::ModelFittingError(
                "There are not enough residual degrees of freedom to perform statistics on this model."
                    .into()
            )
        );
        let scale = ssr / df_resid as f64;
let cov_params = normalized_cov_params * scale;
let se = get_se_from_cov_params(&cov_params);
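        // Center the regressand to get the total sum of squares for R².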
let mean = input_matrix.mean();
let mut centered_input_matrix = input_matrix;
subtract_value_from_matrix(&mut centered_input_matrix, mean);
        let centered_tss = centered_input_matrix.dot(&centered_input_matrix);
let rsquared = 1. - (ssr / centered_tss);
let rsquared_adj = 1. - ((n - 1) as f64 / df_resid as f64 * (1. - rsquared));
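        // t-statistic per parameter: estimate divided by its standard error;
        // the p-values below are two-sided (hence the doubling).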
let tvalues = parameters
.iter()
.zip(se.iter())
            .map(|(&x, &y)| x / y.max(f64::EPSILON));
let pvalues: Vec<f64> = tvalues
.map(|x| students_t_cdf(x.abs().neg(), df_resid as i64).map(|i| i * 2.))
.collect::<Option<_>>()
.ok_or_else(|| {
Error::ModelFittingError(
"Failed to calculate p-values: students_t_cdf failed due to invalid parameters"
.into(),
)
})?;
let parameters: Vec<f64> = parameters.iter().copied().collect();
let residuals: Vec<f64> = residuals.iter().copied().collect();
Ok(Self {
parameters,
se,
ssr,
rsquared,
rsquared_adj,
pvalues,
residuals,
scale,
})
}
#[inline]
pub fn p_values(&self) -> &[f64] {
&self.pvalues
}
#[inline]
pub fn residuals(&self) -> &[f64] {
&self.residuals
}
#[inline]
pub fn parameters(&self) -> &[f64] {
&self.parameters
}
#[inline]
pub fn se(&self) -> &[f64] {
&self.se
}
#[inline]
pub fn ssr(&self) -> f64 {
self.ssr
}
#[inline]
pub fn rsquared(&self) -> f64 {
self.rsquared
}
#[inline]
pub fn rsquared_adj(&self) -> f64 {
self.rsquared_adj
}
#[inline]
pub fn scale(&self) -> f64 {
self.scale
}
}
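/// Fits an OLS model from a row-major data matrix: the first column is the
/// regressand, the remaining columns are the regressors. Unlike the
/// formula-based interface, no intercept column is added automatically, so
/// include a column of ones if an intercept is wanted. A minimal sketch
/// (crate name `linregress` assumed):
///
/// ```
/// # fn main() -> Result<(), linregress::Error> {
/// // Four rows of [y, 1.0, x]; the 1.0 column yields the intercept parameter.
/// let data_row_major = vec![
///     1., 1., 7., //
///     3., 1., 9., //
///     4., 1., 11., //
///     5., 1., 15.,
/// ];
/// let model = linregress::fit_low_level_regression_model(&data_row_major, 4, 3)?;
/// println!("{:?}", model.parameters());
/// # Ok(())
/// # }
/// ```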
pub fn fit_low_level_regression_model(
data_row_major: &[f64],
num_rows: usize,
num_columns: usize,
) -> Result<LowLevelRegressionModel, Error> {
let regression = get_low_level_regression(data_row_major, num_rows, num_columns)?;
let model = LowLevelRegressionModel::from_low_level_regression(regression)?;
Ok(model)
}
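/// Like [`fit_low_level_regression_model`], but returns only the fitted
/// parameters and skips computing the statistics.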
pub fn fit_low_level_regression_model_without_statistics(
data_row_major: &[f64],
num_rows: usize,
num_columns: usize,
) -> Result<Vec<f64>, Error> {
let regression = get_low_level_regression(data_row_major, num_rows, num_columns)?;
Ok(regression.params.iter().copied().collect())
}
fn get_low_level_regression(
data_row_major: &[f64],
num_rows: usize,
num_columns: usize,
) -> Result<InternalLowLevelRegressionResult, Error> {
ensure!(
!data_row_major.is_empty() && num_rows * num_columns == data_row_major.len(),
Error::InconsistentVectors
);
let data = DMatrix::from_row_slice(num_rows, num_columns, data_row_major);
let inputs = data.view((0, 0), (num_rows, 1));
let inputs: Vec<f64> = inputs.iter().copied().collect();
let outputs: DMatrix<f64> = data.view((0, 1), (num_rows, num_columns - 1)).into_owned();
fit_ols_pinv(inputs, outputs)
}
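/// Raw result of the pseudoinverse-based OLS fit; `inputs` holds the
/// regressand and `outputs` the design matrix.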
#[derive(Debug, Clone)]
struct InternalLowLevelRegressionResult {
inputs: Vec<f64>,
outputs: DMatrix<f64>,
params: DMatrix<f64>,
singular_values: DVector<f64>,
normalized_cov_params: DMatrix<f64>,
}
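/// Performs ordinary least squares via the Moore-Penrose pseudoinverse:
/// `params = pinv(outputs) * inputs`, with the normalized covariance matrix
/// `pinv(outputs) * pinv(outputs)^T`.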
fn fit_ols_pinv(
inputs: Vec<f64>,
outputs: DMatrix<f64>,
) -> Result<InternalLowLevelRegressionResult, Error> {
ensure!(
!inputs.is_empty(),
Error::ModelFittingError(
"Fitting the model failed because the input vector is empty".into()
)
);
ensure!(
outputs.nrows() >= 1 && outputs.ncols() >= 1,
Error::ModelFittingError(
"Fitting the model failed because the output matrix is empty".into()
)
);
let singular_values = outputs
.to_owned()
        .try_svd(false, false, f64::EPSILON, 0)
.ok_or_else(|| {
Error::ModelFittingError(
"Computing the singular-value decomposition of the output matrix failed".into(),
)
})?
.singular_values;
    let pinv = outputs.clone().pseudo_inverse(0.).map_err(|_| {
        Error::ModelFittingError("Taking the pseudoinverse of the output matrix failed".into())
    })?;
let normalized_cov_params = &pinv * &pinv.transpose();
let params = get_sum_of_products(&pinv, &inputs);
ensure!(
params.len() >= 2,
Error::ModelFittingError("Invalid parameter matrix".into())
);
Ok(InternalLowLevelRegressionResult {
inputs,
outputs,
params,
singular_values,
normalized_cov_params,
})
}
fn subtract_value_from_matrix(matrix: &mut DMatrix<f64>, sub: f64) {
for i in matrix.iter_mut() {
*i -= sub;
}
}
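/// Extracts the standard errors: the square roots of the diagonal of the
/// covariance matrix.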
fn get_se_from_cov_params(matrix: &DMatrix<f64>) -> Vec<f64> {
matrix
.row_iter()
.enumerate()
.map(|(n, row)| row.get(n).expect("BUG: Matrix is not square").sqrt())
.collect()
}
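/// Multiplies each row of `matrix` with `vector`, yielding the matrix-vector
/// product as a single-column matrix.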
fn get_sum_of_products(matrix: &DMatrix<f64>, vector: &[f64]) -> DMatrix<f64> {
DMatrix::from_iterator(
matrix.nrows(),
1,
matrix
.row_iter()
.map(|row| row.iter().zip(vector.iter()).map(|(x, y)| x * y).sum()),
)
}