bigoish 0.1.3 - Docs.rs

//! Fit `(x, y)` pairs to a complexity model.

use rmpfit::{MPError, MPFitter, MPPar, MPResult};

use crate::model::*;

/// The real results that we're trying to fit a model to.
struct FitToResults<'a> {
    /// The model we're trying to fit:
    pub model: &'a dyn Model,
    /// Sizes of inputs to the real function.
    xs: &'a [f64],
    /// For each input size in `xs`, the corresponding cost (elapsed time, or
    /// CPU instructions, or some other cost measure) of calculating the result.
    ys: &'a [f64],
    /// The parameter bounds for fitting.
    param_bounds: Vec<MPPar>,
}

impl MPFitter for FitToResults<'_> {
    fn eval(&mut self, params: &[f64], deviates: &mut [f64]) -> MPResult<()> {
        for ((d, x), y) in deviates.iter_mut().zip(self.xs.iter()).zip(self.ys.iter()) {
            let f = if self.model.constant() {
                params[0]
            } else {
                params[0] + params[1] * self.model.complexity(*x)
            };
            // We could divide by a sigma, but we expect large values
            // (nanoseconds, CPU instructions), so no need to scale them up.
            *d = *y - f;
        }
        Ok(())
    }

    fn number_of_points(&self) -> usize {
        self.xs.len()
    }

    fn parameters(&self) -> Option<&[rmpfit::MPPar]> {
        Some(&self.param_bounds)
    }
}

#[derive(Debug)]
pub struct FittingError {
    error: MPError,
}

impl std::fmt::Display for FittingError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.error)
    }
}

/// Calculate Akaike infomation criteria (AIC); lower is better.
fn aic(num_parameters: usize, num_points: usize, rss: f64) -> f64 {
    let num_points = num_points as f64;
    let num_parameters = num_parameters as f64;
    2.0 * num_parameters + num_points * (rss / num_points).ln()
}

/// The result of fitting a model.
pub(crate) struct Fitting {
    /// The scaling parameters.
    pub scaling_params: Vec<f64>,
    /// The AIC; lower means better fit.
    pub aic: f64,
    /// The model used for the fitting.
    pub model: BoxedModel,
}

#[cfg(feature = "plots")]
impl Fitting {
    /// Use the found model and scaling parameters to calculate the scaled
    /// complexity for a given input size.
    pub(crate) fn scaled_complexity(&self, n: f64) -> f64 {
        if self.model.constant() {
            self.scaling_params[0]
        } else {
            self.scaling_params[0] + self.scaling_params[1] * self.model.complexity(n)
        }
    }
}

impl std::fmt::Debug for Fitting {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!(
            "Fitting(model={:?}, params={:?}, aic={})",
            self.model.to_string(),
            self.scaling_params,
            self.aic
        ))
    }
}

/// Fit a model to matched real inputs and outputs.
///
/// The parameters that are being fit are `a`, `b`, and `extra_params`, for the
/// function `a + b * model.complexity(x, extra_params)` with the exception that
/// for `Constant` it's just matched to a single parameter `a`). `a` and `b`
/// will be stored in the result's `scaling_params`.
fn fit(model: BoxedModel, xs: &[f64], ys: &[f64]) -> Result<Fitting, FittingError> {
    assert_eq!(xs.len(), ys.len());
    let mut fitter = FitToResults {
        model: &model as &dyn Model,
        xs,
        ys,
        param_bounds: vec![],
    };
    let num_params = if fitter.model.constant() { 1 } else { 2 };
    for _ in 0..num_params {
        let param = MPPar {
            limited_low: true,
            limit_low: 0.0,
            ..Default::default()
        };
        fitter.param_bounds.push(param)
    }

    let mut params = vec![1.0; num_params];
    match fitter.mpfit(&mut params) {
        Ok(status) => Ok(Fitting {
            scaling_params: params,
            aic: aic(
                num_params,
                xs.len(),
                status.resid.iter().map(|x| x * x).sum(),
            ),
            model,
        }),
        Err(error) => Err(FittingError { error }),
    }
}

/// Check if there's a better fit in known default models.
///
/// `Ok` implies the given model was the best.
fn expected_or_other_best_fit<M: Model + 'static>(
    expected_model: M,
    xs: &[f64],
    ys: &[f64],
    known_models: KnownModels,
) -> Result<Fitting, Fitting> {
    // None means expected_fitting is best:
    let mut expected_fitting_is_best = true;
    let expected_model = BoxedModel::new(expected_model);
    let mut best_fitting = fit(expected_model, xs, ys).expect("Failed to fit to the model.");
    for model in known_models.into_iter() {
        if let Ok(fitting) = fit(model, xs, ys) {
            if fitting.aic < best_fitting.aic {
                expected_fitting_is_best = false;
                best_fitting = fitting;
            }
        };
    }

    if expected_fitting_is_best {
        Ok(best_fitting)
    } else {
        Err(best_fitting)
    }
}

/// Check if the measured complexity (`ys`) goes up faster than the modeled
/// complexity from the fitted model. If it does, it's probably a bad model.
fn validate_fit(fitting: &Fitting, xs: &[f64], ys: &[f64]) -> bool {
    match fit(BoxedModel::new(Log(N) * fitting.model.clone()), xs, ys) {
        Ok(other_fitting) => other_fitting.aic > fitting.aic,
        Err(_) => true,
    }
}

/// Which fit is the best?
#[derive(Debug)]
pub(crate) enum BestFit {
    /// The expected model.
    ExpectedModel,
    /// Another known model.
    AnotherKnownModel { best_known: Fitting },
    /// No known model fits. This will happen if validation of the best fitting
    /// model failed; the best fitting (but still wrong) model will be included.
    NotFound { best_known: Fitting },
}

/// Check if there's a better fit in known default models, then validate
/// the presumptive best fit.
pub(crate) fn find_best_fit<M: Model + 'static>(
    expected_model: M,
    xs: &[f64],
    ys: &[f64],
    known_models: KnownModels,
) -> BestFit {
    let fastest_growing = known_models.fastest_growing();
    let result = expected_or_other_best_fit(expected_model, xs, ys, known_models);
    let validates = match result {
        Ok(ref fitting) | Err(ref fitting) => {
            if fastest_growing.is_none() || (Some(&fitting.model) == fastest_growing.as_ref()) {
                validate_fit(fitting, xs, ys)
            } else {
                // There are known faster-growing models that didn't fit, so no
                // need to validate against yet another faster-growing model.
                true
            }
        }
    };
    match (result, validates) {
        (Ok(best_known) | Err(best_known), false) => BestFit::NotFound { best_known },
        (Ok(_), true) => BestFit::ExpectedModel,
        (Err(best_known), true) => BestFit::AnotherKnownModel { best_known },
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    #[macro_export]
    macro_rules! assert_matches {
        ($expression:expr, $pattern:pat) => {
            assert!(matches!($expression, $pattern));
        };
    }

    /// If there is (somehow) a downslope, best we can do is constant model.
    #[test]
    fn downslope_disallowed() {
        let xs = [10., 20., 30., 40.];
        let ys = [100., 90., 80., 70.];
        assert!(expected_or_other_best_fit(Constant, &xs, &ys, KnownModels::default()).is_ok());
        assert_matches!(
            find_best_fit(Constant, &xs, &ys, KnownModels::default()),
            BestFit::ExpectedModel
        );
    }

    /// Vaguely constant data results in Constant model.
    #[test]
    fn constant_matching() {
        let xs = [10., 20., 30., 40.];
        let ys = [100., 90., 80., 110.];
        assert!(expected_or_other_best_fit(Constant, &xs, &ys, KnownModels::default()).is_ok());
        assert_matches!(
            find_best_fit(Constant, &xs, &ys, KnownModels::default()),
            BestFit::ExpectedModel
        );
    }

    #[rstest]
    #[case(N)]
    #[case(Log(N))]
    #[case(Sqrt(N))]
    #[case(N * Log(N))]
    #[case(Log(Log(N)))]
    #[case(N2)]
    #[case(N3)]
    fn model_output_fits_itself<M: Model + PartialEq + Clone + 'static>(#[case] model: M) {
        let xs = [
            100.0,
            140.,
            200.,
            300.,
            500.,
            700.,
            1000.0,
            1400.,
            2000.,
            3000.,
            5000.,
            7000.,
            10_000.0f64,
        ];
        let errors = [1.0001, 0.999].iter().cycle();
        for (a, b) in [(50., 2.), (500_000., 1_000_000.)] {
            let ys = xs
                .iter()
                .zip(errors.clone())
                .map(|(x, err)| (a + b * model.complexity(*x)) * err)
                .collect::<Vec<f64>>();
            let best_fit =
                expected_or_other_best_fit(model.clone(), &xs, &ys, KnownModels::default());
            assert!(best_fit.is_ok());
            assert!(validate_fit(&best_fit.unwrap(), &xs, &ys));
            assert_matches!(
                find_best_fit(model.clone(), &xs, &ys, KnownModels::default()),
                BestFit::ExpectedModel
            );
        }
    }

    #[test]
    fn unknown_model() {
        // An unknown model will get matched to the closest model when using
        // find_better_fit().
        let xs: Vec<f64> = (1..20).map(|i| 10. * 1.4_f64.powi(i)).collect();
        let ys: Vec<f64> = xs.iter().map(|x| x.powi(4)).collect();

        // Of default known models, N^3 is best fit for N^4.
        let best_fit =
            expected_or_other_best_fit(Constant, &xs, &ys, KnownModels::default()).unwrap_err();
        assert_eq!(best_fit.model, BoxedModel::new(N3));

        // However, if we validate that N^3 is the best match, that should fail.
        assert!(!validate_fit(&best_fit, &xs, &ys));
        assert_matches!(
            find_best_fit(Constant, &xs, &ys, KnownModels::default()),
            BestFit::NotFound { .. }
        );

        // N^4 should validate, however:
        let best_fit =
            expected_or_other_best_fit(Pow(4.0), &xs, &ys, KnownModels::default()).unwrap();
        assert_eq!(best_fit.model, BoxedModel::new(Pow(4.)));
        assert!(validate_fit(&best_fit, &xs, &ys));
        assert_matches!(
            find_best_fit(Pow(4.), &xs, &ys, KnownModels::default()),
            BestFit::ExpectedModel
        );

        // And if it's a known model, we do find it:
        let BestFit::AnotherKnownModel { best_known } =
            find_best_fit(Constant, &xs, &ys, KnownModels::default().with(Pow(4.)))
        else {
            panic!("oops")
        };
        assert_eq!(best_known.model, BoxedModel::new(Pow(4.)));
    }
}