// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// RustQuant: A Rust library for quantitative finance tools.
// Copyright (C) 2023 https://github.com/avhz
// Dual licensed under Apache 2.0 and MIT.
// See:
//      - LICENSE-APACHE.md
//      - LICENSE-MIT.md
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

//! Module for logistic regression (classification) algorithms.
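//!
//! A minimal usage sketch (marked `ignore`, so it is not compiled as a
//! doctest; the item paths are those of this module and the data is
//! illustrative only):
//!
//! ```ignore
//! use nalgebra::{DMatrix, DVector};
//!
//! // Four samples, two features. The intercept column of ones is added
//! // internally, so it is not part of the design matrix here.
//! let x = DMatrix::from_row_slice(4, 2, &[
//!     0.1, 0.2,
//!     0.3, 0.4,
//!     0.5, 0.6,
//!     0.7, 0.8,
//! ]);
//! let y = DVector::from_row_slice(&[0., 0., 1., 1.]);
//!
//! let input = LogisticRegressionInput::new(x, y);
//! let output = input
//!     .fit(LogisticRegressionAlgorithm::IRLS, f64::EPSILON.sqrt())
//!     .expect("failed to fit logistic regression");
//!
//! let p_hat = output.predict_proba(&input.x);
//! let y_hat = output.predict(&input.x);
//! ```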

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// IMPORTS
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

use crate::ActivationFunction;
use nalgebra::{DMatrix, DVector};

// use std::f64::EPSILON as EPS;

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// STRUCTS, ENUMS, AND TRAITS
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

/// Struct to hold the input data for a logistic regression.
#[allow(clippy::module_name_repetitions)]
#[derive(Clone, Debug)]
pub struct LogisticRegressionInput<T> {
    /// The input data matrix, also known as the design matrix.
    /// You do not need to add a column of ones to the design matrix,
    /// as this is done automatically.
    pub x: DMatrix<T>,
    /// The output data vector, also known as the response vector.
    /// The values of the response vector should be either 0 or 1.
    pub y: DVector<T>,
}

/// Struct to hold the output data for a logistic regression.
#[allow(clippy::module_name_repetitions)]
#[derive(Clone, Debug)]
pub struct LogisticRegressionOutput<T> {
    /// The coefficients of the logistic regression,
    /// often denoted as b0, b1, b2, ..., bn.
    /// The first coefficient is the intercept (aka. b0 or alpha).
    pub coefficients: DVector<T>,
    /// Number of iterations required to converge.
    pub iterations: usize,
}

/// Algorithm to use for logistic regression.
#[allow(clippy::module_name_repetitions)]
#[derive(Copy, Clone)]
pub enum LogisticRegressionAlgorithm {
    /// Maximum Likelihood Estimation using Algorithmic Adjoint Differentiation
    /// See: <https://en.wikipedia.org/wiki/Logistic_regression#Maximum_likelihood_estimation_(MLE)>
    MLE,
    /// Iterative Reweighted Least Squares
    /// From Wikipedia (<https://en.wikipedia.org/wiki/Logistic_regression#Iteratively_reweighted_least_squares_(IRLS)>):
    /// """
    /// Binary logistic regression can be calculated using
    /// iteratively reweighted least squares (IRLS), which is equivalent to
    /// maximizing the log-likelihood of a Bernoulli
    /// distributed process using Newton's method.
    /// """
    ///
    /// References:
    ///     - Elements of Statistical Learning (Hastie, Tibshirani, Friedman 2009)
    ///     - Machine Learning: A Probabilistic Perspective (Murphy, Kevin P. 2012)
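    ///
    /// Each iteration solves the Newton step `H δ = Xᵀ(y − μ)`, where
    /// `μ = σ(Xβ)` are the current fitted probabilities,
    /// `W = diag(μᵢ(1 − μᵢ))`, and `H = XᵀWX`, then updates `β ← β + δ`.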
    IRLS,
}

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// IMPLEMENTATIONS
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
type PrepareInputResult = Result<(DMatrix<f64>, DMatrix<f64>, DVector<f64>), &'static str>;

impl LogisticRegressionInput<f64> {
    /// Create a new `LogisticRegressionInput` struct.
    ///
    /// # Panics
    ///
    /// Panics if the number of rows in `x` is not equal to the length of `y`.
    #[must_use]
    pub fn new(x: DMatrix<f64>, y: DVector<f64>) -> Self {
        assert!(x.nrows() == y.len());

        Self { x, y }
    }

    /// Function to validate and prepare the input data.
    fn prepare_input(&self) -> PrepareInputResult {
        // Check that the response vector contains only 0s and 1s:
        // an element is invalid if it is neither (approximately) 0 nor 1.
        if self
            .y
            .iter()
            .any(|&x| x.abs() > f64::EPSILON && (x - 1.0).abs() > f64::EPSILON)
        {
            return Err("The elements of the response vector should be either 0 or 1.");
        }

        // Check dimensions match.
        let (n_rows, _) = self.x.shape();

        if n_rows != self.y.len() {
            return Err("The number of rows in the design matrix should match the length of the response vector.");
        }

        // Check the input data is finite.
        if self.x.iter().any(|&x| !x.is_finite()) || self.y.iter().any(|&x| !x.is_finite()) {
            return Err("The input data should be finite.");
        }

        // Add a column of ones to the design matrix.
        let x = self.x.clone().insert_column(0, 1.0);

        // Also return the transpose of the design matrix.
        Ok((x.clone(), x.transpose(), self.y.clone()))
    }

    /// Function to validate and prepare the output data.
    fn prepare_output(&self) -> LogisticRegressionOutput<f64> {
        // Initial guess for the coefficients: a hyperplane orthogonal to the
        // line between the means of class 0 and class 1, passing through the
        // weighted mean of the two clusters.

        let (_n_samples, n_feat) = self.x.shape();
        let ones = DVector::from_element(n_feat, 1.).transpose();

        // Mask selecting the rows of samples in class 1 (where y = 1),
        // and the (scaled) per-feature mean of those samples.
        let mask1 = &self.y * &ones;
        let x1_mean = self.x.component_mul(&mask1).row_mean();

        // Complementary mask selecting the rows of samples in class 0.
        let mask0 = (-&mask1).add_scalar(1.);
        let x0_mean = self.x.component_mul(&mask0).row_mean();

        // Vector pointing from the class-0 mean to the class-1 mean.
        let delta = &x1_mean - &x0_mean;

        // Fraction of samples in class 1, used as a weight.
        let y_mean = self.y.mean();
        let mid = x1_mean * y_mean + x0_mean * (1. - y_mean);

        // Project the weighted mean `mid` onto the direction `delta`.
        let scaler = mid.dot(&delta) / delta.magnitude_squared();
        let dir = delta * scaler;

        // <dir, x> = |dir|^2 is the plane orthogonal to `dir` at distance
        // |dir| from the origin, hence the bias -|dir|^2.
        let bias = -dir.magnitude_squared();
        let coef = dir.insert_column(0, bias);

        // Return the output struct, with the initial guess for the coefficients.
        LogisticRegressionOutput {
            coefficients: coef.transpose(),
            iterations: 0,
        }
    }

    /// Fit a logistic regression model to the input data.
    #[allow(non_snake_case)]
    pub fn fit(
        &self,
        method: LogisticRegressionAlgorithm,
        tolerance: f64,
    ) -> Result<LogisticRegressionOutput<f64>, &'static str> {
        // Validate and prepare the input and output data.
        let (X, X_T, y) = self.prepare_input()?;
        let mut output = self.prepare_output();

        // Number of rows and columns in the design matrix.
        let (n_rows, n_cols) = X.shape();

        // Vector of ones.
        let ones_samples = DVector::from_element(n_rows, 1.);
        let ones_features = DVector::from_element(n_cols, 1.);

        // Vector of coefficients that we update each iteration.
        let mut coefs = DVector::zeros(n_cols);

        match method {
            // MAXIMUM LIKELIHOOD ESTIMATION
            // Using Algorithmic Adjoint Differentiation (AAD)
            // from the `autodiff` module.
            LogisticRegressionAlgorithm::MLE => unimplemented!(),

            // ITERATIVELY RE-WEIGHTED LEAST SQUARES
            // References:
            //      - Elements of Statistical Learning (Hastie, Tibshirani, Friedman 2009)
            //      - Machine Learning: A Probabilistic Perspective (Murphy, Kevin P. 2012)
            LogisticRegressionAlgorithm::IRLS => {
                let mut eta: DVector<f64>;
                let mut mu: DVector<f64>;

                // While not converged.
                // Convergence is defined as the norm of the change in
                // the weights being less than the tolerance.
                while (&coefs - &output.coefficients).norm() >= tolerance {
                    eta = &X * &output.coefficients;
                    mu = ActivationFunction::logistic(&eta);
                    // Multiplying a matrix by a diagonal matrix is the same as
                    // scaling each row (or column) elementwise by the diagonal
                    // entries; this is realised below as an elementwise product
                    // with `ones * diag_entries.transpose()`.
                    let diag_entries = &mu.component_mul(&(&ones_samples - &mu));

                    // Break if data turns out to be linearly separable.
                    if (&y - &mu).max() < tolerance {
                        break;
                    }

                    // For diag-matrix product as elementwise.
                    let diag_repeated = &ones_features * diag_entries.transpose();
                    let X_T_W = X_T.component_mul(&diag_repeated);
                    let hessian = &X_T_W * &X;
                    let z = &X_T * (&y - &mu);
                    let delta_coefs = hessian.lu().solve(&z).unwrap_or_else(|| {
                        panic!(
                            "IRLS[{}]: singular Hessian; cannot solve for the Newton step.",
                            output.iterations
                        )
                    });

                    coefs = &output.coefficients + delta_coefs;

                    output.iterations += 1;

                    std::mem::swap(&mut output.coefficients, &mut coefs);
                }
            }
        }

        Ok(output)
    }
}

impl LogisticRegressionOutput<f64> {
    /// Predicts the output for the given input data.
    #[must_use]
    pub fn predict(&self, input: &DMatrix<f64>) -> DVector<f64> {
        let probabilities = self.predict_proba(input);

        // Predictions (y_hat)
        probabilities.map(|p| if p > 0.5 { 1. } else { 0. })
    }

    /// Compute the probabilities $Pr(output_i=1|input_i,coefficients)$ for the given input data.
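    /// Computed as `pᵢ = σ(b₀ + xᵢ · β)`, where `σ(t) = 1 / (1 + e⁻ᵗ)` is the
    /// logistic function and `b₀` is the intercept.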
    #[must_use]
    pub fn predict_proba(&self, input: &DMatrix<f64>) -> DVector<f64> {
        let coef = self.coefficients.clone();
        let bias = coef[0];
        let weights = coef.remove_row(0);
        let eta = (input * weights).add_scalar(bias);

        // Probabilities
        ActivationFunction::logistic(&eta)
    }

    /// Compute the misclassification rate for given `y` and `y_hat`.
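    ///
    /// Computed as `(1/N) Σᵢ |yᵢ − ŷᵢ|`: the fraction of samples whose
    /// predicted class differs from the true class.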
    ///
    /// # Panics
    ///
    /// Panics if the shape of `y` is not equal to the shape of `y_hat`.
    #[must_use]
    #[allow(non_snake_case)]
    pub fn score_misclassification(&self, y: &DVector<f64>, y_hat: &DVector<f64>) -> f64 {
        assert_eq!(y.shape(), y_hat.shape());
        let (N_samples, _) = y.shape();
        (y - y_hat).abs().sum() / N_samples as f64
    }

    /// Compute the average cross-entropy for given `y` and `p_hat`.
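    ///
    /// Defined as `−(1/N) Σᵢ [yᵢ ln p̂ᵢ + (1 − yᵢ) ln(1 − p̂ᵢ)]`;
    /// lower is better.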
    ///
    /// # Panics
    ///
    /// Panics if the shape of `y` is not equal to the shape of `p_hat`.
    #[must_use]
    pub fn score_cross_entropy(&self, y: &DVector<f64>, p_hat: &DVector<f64>) -> f64 {
        // Note: this could also take a single `&LogisticRegressionInput` parameter.
        assert_eq!(y.shape(), p_hat.shape());

        let y_complement = (-y).add_scalar(1.);
        let p_complement = (-p_hat).add_scalar(1.);
        let log_p = p_hat.map(f64::ln);
        let log_p_complement = p_complement.map(f64::ln);

        // Cross-entropy: the negative mean log-likelihood.
        -(y.component_mul(&log_p) + y_complement.component_mul(&log_p_complement)).mean()
    }
}

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// UNIT TESTS
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#[cfg(test)]
mod tests_logistic_regression {
    use super::*;
    use std::time::Instant;
    use RustQuant_math::distributions::DistributionClass;
    // use crate::assert_approx_equal;

    #[test]
    fn test_logistic_regression() {
        // PROFILE THIS UNIT TEST WITH (on MacOS):
        // sudo -E cargo flamegraph --release --freq 5000 --unit-test -- tests_logistic_regression::test_logistic_regression

        // TEST DATA GENERATED FROM THE FOLLOWING R v4.3.0 CODE:
        //
        // set.seed(1234)
        //
        // features    <- c("x1", "x2", "x3")
        //
        // (x_train  <- data.frame(matrix(rnorm(12), 4, 3))); colnames(x_train) <- features
        // (x_test   <- data.frame(matrix(rnorm(12), 4, 3))); colnames(x_test)  <- features
        //
        // (response <- sample(c(0,1), 4, replace = TRUE))
        //
        // (model <- glm(response ~ ., data = x_train, family = binomial))
        // (preds <- predict(model, newdata = x_test, type = "response"))

        #[rustfmt::skip]
        let x_train = DMatrix::from_row_slice(
            4, // rows
            3, // columns
            &[-1.207_065_7,  0.429_124_7, -0.564_452_0,
               0.277_429_2,  0.506_055_9, -0.890_037_8,
               1.084_441_2, -0.574_740_0, -0.477_192_7,
              -2.345_697_7, -0.546_631_9, -0.998_386_4],
        );

        #[rustfmt::skip]
        let _x_test = DMatrix::from_row_slice(
            4, // rows
            3, // columns
            &[-0.776_253_89, -0.511_009_5,  0.134_088_2,
               0.064_458_82, -0.911_195_4, -0.490_685_9,
               0.959_494_06, -0.837_171_7, -0.440_547_9,
              -0.110_285_49,  2.415_835_2,  0.459_589_4],
        );

        let response = DVector::from_row_slice(&[0., 1., 1., 1.]);

        // Fit the model to the training data.
        let input = LogisticRegressionInput {
            x: x_train,
            y: response,
        };

        let start_none = Instant::now();
        let output = input.fit(LogisticRegressionAlgorithm::IRLS, f64::EPSILON.sqrt());
        let elapsed_none = start_none.elapsed();

        match output {
            Ok(output) => {
                println!("Iterations: \t{}", output.iterations);
                println!("Time taken: \t{:?}", elapsed_none);
                // println!("Intercept: \t{:?}", output.intercept);
                println!("Coefficients: \t{:?}", output.coefficients);
            }
            Err(err) => {
                panic!("Failed to fit logistic regression model: {}", err);
            }
        }

        // // Predict the response for the test data.
        // let preds = output.predict(x_test);

        // // Check intercept.
        // assert_approx_equal!(output.intercept, 0.45326734, 1e-6);

        // // Check coefficients.
        // for (i, coefficient) in output.coefficients.iter().enumerate() {
        //     assert_approx_equal!(
        //         coefficient,
        //         &[0.45326734, 1.05986612, -0.16909348, 2.29605328][i],
        //         1e-6
        //     );
        // }

        // // Check predictions.
        // for (i, pred) in preds.iter().enumerate() {
        //     assert_approx_equal!(
        //         pred,
        //         &[0.0030197504, 0.4041016953, 2.4605541127, 1.6571889522][i],
        //         1e-3
        //     );
        // }
    }

    #[test]
    #[allow(non_snake_case)]
    fn test_logistic_regression_stochastic() {
        // cargo test --release tests_logistic_regression::test_logistic_regression_stochastic -- --nocapture

        // The test generates sample data as follows:
        // - For each of the N samples (train/test), draw K feature values from a
        //   uniform distribution over (-0.5, 0.5) and arrange them as the design matrix `X`.
        // - For the coefficients of the generating distribution, draw K values from the
        //   surface of the unit sphere S_(K-1) and a bias from Uniform(-0.7, 0.7);
        //   arrange them as the DVector `coefs`.
        // - Compute the vector of probabilities P(target = 1) as sigmoid(X_ext * coefs).
        // - Compute the target values: for each sample i, draw from Bernoulli(prob_i).
        use RustQuant_math::distributions::{Bernoulli, Distribution, Gaussian, Uniform};

        let N_train = 200; // Number of training samples.
        let N_test = 40; // Number of test samples.
        let K = 10; // Number of features.

        let distr_normal = Gaussian::default();
        let distr_uniform_bias = Uniform::new(-0.7, 0.7, DistributionClass::Continuous);
        let distr_uniform_steepness = Uniform::new(0.5, 5., DistributionClass::Continuous);

        // Generate random coefficients used to create the target values for the x_i:
        // direction uniform on the unit sphere, bias uniform in (-0.7, 0.7),
        // scaled by the steepness.
        let bias = distr_uniform_bias.sample(1).unwrap()[0];
        let steepness = distr_uniform_steepness.sample(1).unwrap()[0];

        let coefs = DVector::from_vec(distr_normal.sample(K).unwrap())
            .normalize()
            .insert_row(0, bias)
            .scale(steepness);

        let logistic_regression = LogisticRegressionOutput {
            coefficients: coefs,
            iterations: 0,
        };

        // Generate random design matrices for train/test.
        let distr_uniform_features = Uniform::new(-0.5, 0.5, DistributionClass::Continuous);

        let x_train = DMatrix::<f64>::from_vec(
            N_train,
            K,
            distr_uniform_features.sample(N_train * K).unwrap(),
        );
        let x_test = DMatrix::from_vec(
            N_test,
            K,
            distr_uniform_features.sample(N_test * K).unwrap(),
        );

        // Compute the probability for each sample x_i.
        let probs_train = logistic_regression.predict_proba(&x_train);
        let probs_test = logistic_regression.predict_proba(&x_test);

        // sample from Bernoulli distribution with p=p_i for each sample i
        let y_train = probs_train.map(|p| Bernoulli::new(p).sample(1).unwrap()[0]);
        let y_test = probs_test.map(|p| Bernoulli::new(p).sample(1).unwrap()[0]);

        // Fit the model to the training data.
        let input = LogisticRegressionInput {
            x: x_train,
            y: y_train,
        };

        let start_none = Instant::now();
        let output = input.fit(LogisticRegressionAlgorithm::IRLS, f64::EPSILON.sqrt());
        let elapsed_none = start_none.elapsed();

        match output {
            Ok(output) => {
                let y_hat = output.predict(&x_test);
                let misclassification_rate = output.score_misclassification(&y_test, &y_hat);
                let p_hat = output.predict_proba(&x_test);
                let crossentropy = output.score_cross_entropy(&y_test, &p_hat);
                println!(
                    "Number of samples N_train={}, N_test={}, number of Features K={}",
                    N_train, N_test, K
                );
                println!(
                    "Misclassification_rate(out of sample): \t{}",
                    misclassification_rate
                );
                println!("Avg crossentropy(out of sample): \t{}", crossentropy);
                println!("Iterations: \t{}", output.iterations);
                println!("Time taken: \t{:?}", elapsed_none);
                // println!("Intercept: \t{:?}", output.intercept);
                // print computed coeffs and original coeffs
                println!("Coefficients found by IRLS:\n{:?}", &output.coefficients);
                println!(
                    "Coefficients used for the generation of the training data:\n{:?}",
                    &logistic_regression.coefficients
                );
            }
            Err(err) => {
                panic!("Failed to fit logistic regression model: {}", err);
            }
        }
    }
}