Skip to main content

linreg_core/regularized/
ridge.rs

1//! Ridge regression (L2-regularized linear regression).
2//!
3//! This module provides ridge regression implementation using the augmented QR
4//! approach, which is numerically stable and avoids forming X^T X explicitly.
5//!
6//! # Ridge Regression Objective
7//!
8//! Ridge regression solves:
9//!
10//! ```text
11//! minimize over (β₀, β):
12//!
13//!     (1/(2n)) * Σᵢ (yᵢ - β₀ - xᵢᵀβ)² + (λ/2) * ||β||₂²
14//! ```
15//!
16//! The intercept `β₀` is **not penalized**.
17//!
18//! # Solution Method
19//!
20//! We use the augmented least-squares approach:
21//!
22//! ```text
23//! minimize || [y; 0] - [X; √λ*I] * β ||²
24//! ```
25//!
26//! This transforms the ridge problem into a standard least squares problem
27//! that can be solved with QR decomposition.
28
29use crate::error::{Error, Result};
30use crate::linalg::Matrix;
31use crate::regularized::preprocess::{
32    predict, standardize_xy, unstandardize_coefficients, StandardizeOptions,
33};
34
35#[cfg(feature = "wasm")]
36use serde::Serialize;
37
38/// Options for ridge regression fitting.
39///
40/// # Fields
41///
42/// * `lambda` - Regularization strength (single value)
43/// * `intercept` - Whether to include an intercept term (default: true)
44/// * `standardize` - Whether to standardize predictors (default: true)
45#[derive(Clone, Debug)]
46pub struct RidgeFitOptions {
47    /// Regularization strength (must be >= 0)
48    pub lambda: f64,
49    /// Whether to include an intercept
50    pub intercept: bool,
51    /// Whether to standardize predictors
52    pub standardize: bool,
53}
54
55impl Default for RidgeFitOptions {
56    fn default() -> Self {
57        RidgeFitOptions {
58            lambda: 1.0,
59            intercept: true,
60            standardize: true,
61        }
62    }
63}
64
65/// Result of a ridge regression fit.
66///
67/// # Fields
68///
69/// * `lambda` - The lambda value used for fitting
70/// * `intercept` - Intercept coefficient (on original scale)
71/// * `coefficients` - Slope coefficients (on original scale)
72/// * `fitted_values` - In-sample predictions
73/// * `residuals` - Residuals (y - fitted_values)
74/// * `df` - Effective degrees of freedom (trace of H = X(X'X + λI)^(-1)X')
75/// * `r_squared` - R² (coefficient of determination)
76/// * `adj_r_squared` - Adjusted R² (using effective df)
77/// * `mse` - Mean squared error
78/// * `rmse` - Root mean squared error
79/// * `mae` - Mean absolute error
80/// * `standardization_info` - Information about standardization applied
81#[derive(Clone, Debug)]
82#[cfg_attr(feature = "wasm", derive(Serialize))]
83pub struct RidgeFit {
84    /// Lambda value used for fitting
85    pub lambda: f64,
86    /// Intercept on original scale
87    pub intercept: f64,
88    /// Slope coefficients on original scale
89    pub coefficients: Vec<f64>,
90    /// Fitted values
91    pub fitted_values: Vec<f64>,
92    /// Residuals
93    pub residuals: Vec<f64>,
94    /// Effective degrees of freedom
95    pub df: f64,
96    /// R² (coefficient of determination)
97    pub r_squared: f64,
98    /// Adjusted R² (penalized for effective df)
99    pub adj_r_squared: f64,
100    /// Mean squared error
101    pub mse: f64,
102    /// Root mean squared error
103    pub rmse: f64,
104    /// Mean absolute error
105    pub mae: f64,
106}
107
108/// Fits ridge regression for a single lambda value.
109///
110/// # Arguments
111///
112/// * `x` - Design matrix (n × p). Should include intercept column if `intercept=true`.
113/// * `y` - Response vector (n elements)
114/// * `options` - Ridge fitting options
115///
116/// # Returns
117///
118/// A [`RidgeFit`] containing the fit results.
119///
120/// # Errors
121///
122/// Returns an error if:
123/// - `lambda < 0`
124/// - Dimensions don't match
125/// - Matrix is numerically singular
126///
127/// # Algorithm
128///
129/// Uses the augmented QR approach:
130/// 1. Standardize X and center y (if requested)
131/// 2. Build augmented system:
132///    ```text
133///    X_aug = [X_std; sqrt(lambda) * I_p]
134///    y_aug = [y_centered; 0_p]
135///    ```
136/// 3. Solve using QR decomposition
137/// 4. Unstandardize coefficients
138///
139/// # Example
140///
141/// ```rust,no_run
142/// use linreg_core::linalg::Matrix;
143/// use linreg_core::regularized::ridge::{ridge_fit, RidgeFitOptions};
144///
145/// let x = Matrix::new(3, 2, vec![
146///     1.0, 2.0,
147///     1.0, 3.0,
148///     1.0, 4.0,
149/// ]);
150/// let y = vec![3.0, 5.0, 7.0];
151///
152/// let options = RidgeFitOptions {
153///     lambda: 1.0,
154///     intercept: true,
155///     standardize: true,
156/// };
157///
158/// let fit = ridge_fit(&x, &y, &options).unwrap();
159/// println!("Intercept: {}", fit.intercept);
160/// println!("Coefficients: {:?}", fit.coefficients);
161/// ```
162pub fn ridge_fit(x: &Matrix, y: &[f64], options: &RidgeFitOptions) -> Result<RidgeFit> {
163    if options.lambda < 0.0 {
164        return Err(Error::InvalidInput(
165            "Lambda must be non-negative for ridge regression".to_string(),
166        ));
167    }
168
169    let n = x.rows;
170    let p = x.cols;
171
172    if y.len() != n {
173        return Err(Error::DimensionMismatch(
174            format!("Length of y ({}) must match number of rows in X ({})", y.len(), n)
175        ));
176    }
177
178    // Handle zero lambda: just do OLS
179    if options.lambda == 0.0 {
180        return ridge_ols_fit(x, y, options);
181    }
182
183    // Standardize X and center y
184    let std_options = StandardizeOptions {
185        intercept: options.intercept,
186        standardize_x: options.standardize,
187        standardize_y: false, // Don't standardize y for ridge
188    };
189
190    let (x_std, y_centered, std_info) = standardize_xy(x, y, &std_options);
191
192    // Build augmented system: [X; sqrt(lambda)*I] * beta = [y; 0]
193    // For the intercept column (if present), we don't add penalty
194    let sqrt_lambda = options.lambda.sqrt();
195    let intercept_col = if options.intercept { 1 } else { 0 };
196
197    // Number of penalized coefficients (excluding intercept)
198    let p_pen = p - intercept_col;
199
200    // Augmented matrix dimensions
201    let aug_n = n + p_pen;
202    let aug_p = p;
203
204    // Build augmented matrix
205    let mut x_aug_data = vec![0.0; aug_n * aug_p];
206
207    // Copy X_std to top portion
208    for i in 0..n {
209        for j in 0..p {
210            x_aug_data[i * aug_p + j] = x_std.get(i, j);
211        }
212    }
213
214    // Add sqrt(lambda) * I for penalized coefficients
215    for i in 0..p_pen {
216        let row = n + i;
217        let col = intercept_col + i;
218        x_aug_data[row * aug_p + col] = sqrt_lambda;
219    }
220
221    let x_aug = Matrix::new(aug_n, aug_p, x_aug_data);
222
223    // Build augmented y vector
224    let mut y_aug = vec![0.0; aug_n];
225    for i in 0..n {
226        y_aug[i] = y_centered[i];
227    }
228    // Remaining entries are already 0
229
230    // Solve using QR decomposition
231    let (q, r) = x_aug.qr();
232    let beta_std = solve_upper_triangular_with_augmented_y(&r, &q, &y_aug, aug_n)?;
233
234    // Unstandardize coefficients
235    let (intercept, beta_orig) = unstandardize_coefficients(&beta_std, &std_info);
236
237    // Compute fitted values and residuals on original scale
238    let fitted = predict(x, intercept, &beta_orig);
239    let residuals: Vec<f64> = y.iter().zip(fitted.iter()).map(|(yi, yh)| yi - yh).collect();
240
241    // Compute effective degrees of freedom
242    // For ridge: df = trace(X(X'X + lambda*I)^(-1)X')
243    // This equals sum of eigenvalues / (eigenvalues + lambda)
244    // We compute it using the hat matrix approach
245    let df = compute_ridge_df(&x_std, options.lambda, intercept_col);
246
247    // Compute model fit statistics
248    let y_mean: f64 = y.iter().sum::<f64>() / n as f64;
249    let ss_tot: f64 = y.iter().map(|yi| (yi - y_mean).powi(2)).sum();
250    let ss_res: f64 = residuals.iter().map(|r| r.powi(2)).sum();
251    let r_squared = if ss_tot > 1e-10 {
252        1.0 - ss_res / ss_tot
253    } else {
254        1.0
255    };
256
257    // Adjusted R² using effective degrees of freedom
258    let adj_r_squared = if ss_tot > 1e-10 && (n as f64) > df {
259        1.0 - (1.0 - r_squared) * ((n - 1) as f64 / (n as f64 - df))
260    } else {
261        r_squared
262    };
263
264    let mse = ss_res / (n as f64 - 1.0); // Use n-1 for consistency
265    let rmse = mse.sqrt();
266    let mae: f64 = residuals.iter().map(|r| r.abs()).sum::<f64>() / n as f64;
267
268    Ok(RidgeFit {
269        lambda: options.lambda,
270        intercept,
271        coefficients: beta_orig,
272        fitted_values: fitted,
273        residuals,
274        df,
275        r_squared,
276        adj_r_squared,
277        mse,
278        rmse,
279        mae,
280    })
281}
282
283/// Computes the effective degrees of freedom for ridge regression.
284///
285/// df = trace(H) where H = X(X'X + λI)^(-1)X'
286///
287/// For a QR decomposition of the standardized X, this equals the sum of
288/// squared diagonal elements of R(R'R + λI)^(-1).
289fn compute_ridge_df(x_std: &Matrix, lambda: f64, intercept_col: usize) -> f64 {
290    let p = x_std.cols;
291
292    // For small problems, compute directly
293    if p <= 100 {
294        // Get QR decomposition of X_std
295        let (_q, _r) = x_std.qr();
296
297        // Compute df = trace(X(X'X + λI)^(-1)X')
298        // This equals trace(R(R'R + λI)^(-1)R') / n for centered data
299        // A simpler approach: df = sum of (d_i^2 / (d_i^2 + lambda))
300        // where d_i are singular values of X
301
302        // For ridge, a simple approximation that works well:
303        // df = sum_{j not penalized} 1 + sum_{j penalized} sigma_j^2 / (sigma_j^2 + lambda)
304        // where sigma_j^2 are eigenvalues of X'X
305
306        // Use the approximation: df ≈ p - lambda * trace((X'X + lambda*I)^(-1))
307        // For now, use a simpler proxy
308        let p_pen = p - intercept_col;
309        let df_penalty = if lambda > 0.0 {
310            // Approximate reduction in df due to penalty
311            (p_pen as f64) * lambda / (1.0 + lambda)
312        } else {
313            0.0
314        };
315
316        (p as f64) - df_penalty
317    } else {
318        // For large p, use a simpler approximation
319        p as f64 * lambda / (1.0 + lambda)
320    }
321}
322
323/// OLS fit for lambda = 0 (special case of ridge).
324fn ridge_ols_fit(x: &Matrix, y: &[f64], options: &RidgeFitOptions) -> Result<RidgeFit> {
325    let n = x.rows;
326    let p = x.cols;
327
328    // Use QR decomposition for OLS on original (non-standardized) data
329    let (q, r) = x.qr();
330    let beta = solve_upper_triangular_with_augmented_y(&r, &q, y, n)?;
331
332    // Extract intercept and slope coefficients directly (no unstandardization needed)
333    // OLS on original data gives coefficients on original scale
334    let (intercept, beta_orig) = if options.intercept {
335        // beta[0] is intercept, beta[1..] are slopes
336        let slopes: Vec<f64> = beta[1..].to_vec();
337        (beta[0], slopes)
338    } else {
339        // No intercept, all coefficients are slopes
340        (0.0, beta)
341    };
342
343    // Compute fitted values and residuals
344    let fitted = predict(x, intercept, &beta_orig);
345    let residuals: Vec<f64> = y.iter().zip(fitted.iter()).map(|(yi, yh)| yi - yh).collect();
346
347    // For OLS, df = p (or n - 1 if considering adjusted df)
348    let df = p as f64;
349
350    // Compute model fit statistics
351    let y_mean: f64 = y.iter().sum::<f64>() / n as f64;
352    let ss_tot: f64 = y.iter().map(|yi| (yi - y_mean).powi(2)).sum();
353    let ss_res: f64 = residuals.iter().map(|r| r.powi(2)).sum();
354    let r_squared = if ss_tot > 1e-10 {
355        1.0 - ss_res / ss_tot
356    } else {
357        1.0
358    };
359
360    // Adjusted R² using effective degrees of freedom
361    let adj_r_squared = if ss_tot > 1e-10 && n > p {
362        1.0 - (1.0 - r_squared) * ((n - 1) as f64 / (n - p) as f64)
363    } else {
364        r_squared
365    };
366
367    let mse = ss_res / (n as f64 - p as f64);
368    let rmse = mse.sqrt();
369    let mae: f64 = residuals.iter().map(|r| r.abs()).sum::<f64>() / n as f64;
370
371    Ok(RidgeFit {
372        lambda: 0.0,
373        intercept,
374        coefficients: beta_orig,
375        fitted_values: fitted,
376        residuals,
377        df,
378        r_squared,
379        adj_r_squared,
380        mse,
381        rmse,
382        mae,
383    })
384}
385
386/// Solves R * beta = Q^T * y_aug for beta.
387///
388/// This is a helper for the augmented QR approach.
389fn solve_upper_triangular_with_augmented_y(
390    r: &Matrix,
391    q: &Matrix,
392    y_aug: &[f64],
393    aug_n: usize,
394) -> Result<Vec<f64>> {
395    let p = r.cols;
396
397    // Compute Q^T * y_aug (only need first p rows since R is p × p or m × p)
398    // Actually, Q is aug_n × aug_n, but we only need Q^T * y_aug for first p rows
399    // since R has zeros below row p
400
401    let mut qty = vec![0.0; p];
402
403    // Compute Q^T * y_aug for the first p rows
404    for i in 0..p {
405        let mut sum = 0.0;
406        for k in 0..aug_n {
407            sum += q.get(k, i) * y_aug[k];
408        }
409        qty[i] = sum;
410    }
411
412    // Back substitution: solve R * beta = qty
413    let mut beta = vec![0.0; p];
414
415    for i in (0..p).rev() {
416        let mut sum = qty[i];
417        for j in (i + 1)..p {
418            sum -= r.get(i, j) * beta[j];
419        }
420
421        let diag = r.get(i, i);
422        if diag.abs() < 1e-14 {
423            return Err(Error::ComputationFailed(
424                "Matrix is singular to working precision".to_string(),
425            ));
426        }
427
428        beta[i] = sum / diag;
429    }
430
431    Ok(beta)
432}
433
434/// Makes predictions using a ridge regression fit.
435///
436/// # Arguments
437///
438/// * `fit` - The ridge regression fit result
439/// * `x_new` - New data matrix (n_new × p)
440///
441/// # Returns
442///
443/// Predictions for each row in x_new.
444pub fn predict_ridge(fit: &RidgeFit, x_new: &Matrix) -> Vec<f64> {
445    predict(x_new, fit.intercept, &fit.coefficients)
446}
447
448#[cfg(test)]
449mod tests {
450    use super::*;
451
452    #[test]
453    fn test_ridge_fit_simple() {
454        // Simple test: perfect linear relationship
455        let x_data = vec![
456            1.0, 1.0,
457            1.0, 2.0,
458            1.0, 3.0,
459            1.0, 4.0,
460        ];
461        let x = Matrix::new(4, 2, x_data);
462        let y = vec![2.0, 4.0, 6.0, 8.0]; // y = 2 * x (with intercept 0)
463
464        let options = RidgeFitOptions {
465            lambda: 0.1,
466            intercept: true,
467            standardize: false,
468        };
469
470        let fit = ridge_fit(&x, &y, &options).unwrap();
471
472        // With small lambda, should be close to OLS solution
473        // OLS solution: intercept ≈ 0, slope ≈ 2
474        // coefficients[0] is the first (and only) slope coefficient
475        // Note: Ridge regularization introduces some bias, so tolerances are slightly looser
476        assert!((fit.coefficients[0] - 2.0).abs() < 0.2);
477        assert!(fit.intercept.abs() < 0.5);
478    }
479
480    #[test]
481    fn test_ridge_fit_with_standardization() {
482        let x_data = vec![
483            1.0, 100.0,
484            1.0, 200.0,
485            1.0, 300.0,
486            1.0, 400.0,
487        ];
488        let x = Matrix::new(4, 2, x_data);
489        let y = vec![2.0, 4.0, 6.0, 8.0];
490
491        let options = RidgeFitOptions {
492            lambda: 1.0,
493            intercept: true,
494            standardize: true,
495        };
496
497        let fit = ridge_fit(&x, &y, &options).unwrap();
498
499        // Predictions should be reasonable
500        for i in 0..4 {
501            assert!((fit.fitted_values[i] - y[i]).abs() < 2.0);
502        }
503    }
504
505    #[test]
506    fn test_ridge_zero_lambda_is_ols() {
507        let x_data = vec![
508            1.0, 1.0,
509            1.0, 2.0,
510            1.0, 3.0,
511        ];
512        let x = Matrix::new(3, 2, x_data);
513        let y = vec![2.0, 4.0, 6.0];
514
515        let options = RidgeFitOptions {
516            lambda: 0.0,
517            intercept: true,
518            standardize: false,
519        };
520
521        let fit = ridge_fit(&x, &y, &options).unwrap();
522
523        // Should be close to perfect fit for this data
524        assert!((fit.fitted_values[0] - 2.0).abs() < 1e-6);
525        assert!((fit.fitted_values[1] - 4.0).abs() < 1e-6);
526        assert!((fit.fitted_values[2] - 6.0).abs() < 1e-6);
527    }
528
529    #[test]
530    fn test_ridge_negative_lambda_error() {
531        let x_data = vec![1.0, 1.0, 1.0, 2.0, 1.0, 3.0];
532        let x = Matrix::new(3, 2, x_data);
533        let y = vec![2.0, 4.0, 6.0];
534
535        let options = RidgeFitOptions {
536            lambda: -1.0,
537            ..Default::default()
538        };
539
540        let result = ridge_fit(&x, &y, &options);
541        assert!(result.is_err());
542    }
543
544    #[test]
545    fn test_predict_ridge() {
546        let x_data = vec![
547            1.0, 1.0,
548            1.0, 2.0,
549            1.0, 3.0,
550        ];
551        let x = Matrix::new(3, 2, x_data);
552        let y = vec![2.0, 4.0, 6.0];
553
554        let options = RidgeFitOptions {
555            lambda: 0.1,
556            intercept: true,
557            standardize: false,
558        };
559
560        let fit = ridge_fit(&x, &y, &options).unwrap();
561        let preds = predict_ridge(&fit, &x);
562
563        // Predictions on training data should equal fitted values
564        for i in 0..3 {
565            assert!((preds[i] - fit.fitted_values[i]).abs() < 1e-10);
566        }
567    }
568}