linreg_core/regularized/
preprocess.rs

1//! Data preprocessing for regularized regression.
2//!
3//! This module provides standardization utilities that match glmnet's behavior:
4//!
5//! - Predictors are centered and scaled (if enabled)
6//! - The intercept column is not penalized, so it's handled specially
7//! - Coefficients can be unstandardized back to the original scale
8//!
9//! # Standardization Convention
10//!
11//! The scaling factor used is `sqrt(sum(x²) / n)`, which gives unit variance
12//! under the 1/n convention (matching glmnet).
13
14use crate::linalg::{vec_mean, Matrix};
15
16/// Information stored during standardization, used to unstandardize coefficients.
17///
18/// This struct captures all the information needed to transform coefficients
19/// from the standardized space back to the original data scale.
20///
21/// # Fields
22///
23/// * `x_mean` - Mean of each predictor column (length p)
24/// * `x_scale` - Scale factor for each predictor column (length p)
25/// * `y_mean` - Mean of response variable
26/// * `y_scale` - Scale factor for response (optional, used for lambda path)
27/// * `intercept` - Whether an intercept term was included
28/// * `standardized_x` - Whether X was standardized
29/// * `standardized_y` - Whether y was standardized
30#[derive(Clone, Debug)]
31pub struct StandardizationInfo {
32    /// Mean of each predictor column
33    pub x_mean: Vec<f64>,
34    /// Scale factor for each predictor column
35    pub x_scale: Vec<f64>,
36    /// Mean of response variable
37    pub y_mean: f64,
38    /// Scale factor for response (for lambda path construction)
39    pub y_scale: Option<f64>,
40    /// Whether an intercept was included
41    pub intercept: bool,
42    /// Whether X was standardized
43    pub standardized_x: bool,
44    /// Whether y was standardized
45    pub standardized_y: bool,
46}
47
48/// Options for standardization.
49///
50/// # Fields
51///
52/// * `intercept` - Whether to include/center an intercept (default: true)
53/// * `standardize_x` - Whether to standardize predictors (default: true)
54/// * `standardize_y` - Whether to standardize response (default: false)
55///
56/// # Note
57///
58/// Setting `standardize_y` to `true` is mainly useful when you want to match
59/// glmnet's lambda sequence exactly. For single-lambda fits, you typically
60/// don't need to standardize y.
61#[derive(Clone, Debug)]
62pub struct StandardizeOptions {
63    /// Whether to include an intercept (and center X)
64    pub intercept: bool,
65    /// Whether to standardize predictor columns
66    pub standardize_x: bool,
67    /// Whether to standardize the response variable
68    pub standardize_y: bool,
69}
70
71impl Default for StandardizeOptions {
72    fn default() -> Self {
73        StandardizeOptions {
74            intercept: true,
75            standardize_x: true,
76            standardize_y: false,
77        }
78    }
79}
80
81/// Standardizes X and y for regularized regression (glmnet-compatible).
82///
83/// This function performs the same standardization as glmnet with
84/// `standardize=TRUE`. The first column of X is assumed to be the intercept
85/// (all ones) and is NOT standardized.
86///
87/// # Arguments
88///
89/// * `x` - Design matrix (n × p). First column should be intercept if `intercept=true`.
90/// * `y` - Response vector (n elements)
91/// * `options` - Standardization options
92///
93/// # Returns
94///
95/// A tuple `(x_std, y_std, info)` where:
96/// - `x_std` is the standardized design matrix
97/// - `y_std` is the (optionally) standardized response
98/// - `info` contains the standardization parameters for unstandardization
99///
100/// # Standardization Details
101///
102/// For the intercept column (first column, if present):
103/// - Not centered (stays as ones)
104/// - Not scaled
105///
106/// For other columns (if `standardize_x=true`):
107/// - Centered: `x_centered = x - mean(x)`
108/// - Scaled: `x_scaled = x_centered / sqrt(sum(x²) / n)`
109///
110/// For y (if `standardize_y=true`):
111/// - Centered: `y_centered = y - mean(y)`
112/// - Scaled: `y_scaled = y_centered / sqrt(sum(y²) / n)`
113pub fn standardize_xy(
114    x: &Matrix,
115    y: &[f64],
116    options: &StandardizeOptions,
117) -> (Matrix, Vec<f64>, StandardizationInfo) {
118    let n = x.rows;
119    let p = x.cols;
120
121    let mut x_std = x.clone();
122    let mut y_std = y.to_vec();
123
124    let mut x_mean = vec![0.0; p];
125    let mut x_scale = vec![1.0; p];
126
127    let y_mean = if options.intercept && !y.is_empty() {
128        vec_mean(y)
129    } else {
130        0.0
131    };
132
133    // Always center y when there's an intercept (required for coordinate descent)
134    // Additionally scale y if standardize_y is requested
135    let y_scale = if options.intercept {
136        // Center y
137        for yi in y_std.iter_mut() {
138            *yi -= y_mean;
139        }
140
141        if options.standardize_y {
142            // Also scale y
143            let y_var = y_std.iter().map(|&yi| yi * yi).sum::<f64>() / n as f64;
144            let y_scale_val = y_var.sqrt();
145            if y_scale_val > 0.0 {
146                for yi in y_std.iter_mut() {
147                    *yi /= y_scale_val;
148                }
149            }
150            Some(y_scale_val)
151        } else {
152            None
153        }
154    } else if options.standardize_y {
155        // No intercept but standardize_y requested - center and scale
156        let y_mean_local = vec_mean(y);
157        for yi in y_std.iter_mut() {
158            *yi -= y_mean_local;
159        }
160        let y_var = y_std.iter().map(|&yi| yi * yi).sum::<f64>() / n as f64;
161        let y_scale_val = y_var.sqrt();
162        if y_scale_val > 0.0 {
163            for yi in y_std.iter_mut() {
164                *yi /= y_scale_val;
165            }
166        }
167        Some(y_scale_val)
168    } else {
169        None
170    };
171
172    // Standardize X columns
173    // If intercept is present, first column is NOT standardized
174    let start_col = if options.intercept { 1 } else { 0 };
175
176    for j in start_col..p {
177        // Compute column mean
178        let mut col_mean = 0.0;
179        for i in 0..n {
180            col_mean += x_std.get(i, j);
181        }
182        col_mean /= n as f64;
183        x_mean[j] = col_mean;
184
185        // Always center the column (required for coordinate descent to work correctly)
186        for i in 0..n {
187            let val = x_std.get(i, j) - col_mean;
188            x_std.set(i, j, val);
189        }
190
191        // Always compute and apply scaling (coordinate descent assumes unit variance)
192        // The standardize_x option only affects how coefficients are returned
193        let mut col_scale_sq = 0.0;
194        for i in 0..n {
195            let val = x_std.get(i, j);
196            col_scale_sq += val * val;
197        }
198        let col_scale = (col_scale_sq / n as f64).sqrt();
199
200        if col_scale > 0.0 {
201            // Always scale internally for correct coordinate descent behavior
202            for i in 0..n {
203                let val = x_std.get(i, j) / col_scale;
204                x_std.set(i, j, val);
205            }
206
207            // Always store actual scale factor for correct unstandardization
208            // (coefficients are always returned on original scale, matching glmnet)
209            x_scale[j] = col_scale;
210        }
211    }
212
213    // If intercept column exists, set its scale to 1.0 (not penalized)
214    if options.intercept && p > 0 {
215        x_scale[0] = 1.0;
216        x_mean[0] = 0.0; // Intercept column has no "mean" to subtract
217    }
218
219    let info = StandardizationInfo {
220        x_mean,
221        x_scale,
222        y_mean,
223        y_scale,
224        intercept: options.intercept,
225        standardized_x: options.standardize_x,
226        standardized_y: options.standardize_y,
227    };
228
229    (x_std, y_std, info)
230}
231
232/// Unstandardizes coefficients from the standardized space back to original scale.
233///
234/// This reverses the standardization transformation to get coefficients that
235/// can be applied to the original (unscaled) data.
236///
237/// # Arguments
238///
239/// * `beta_std` - Coefficients in standardized space (length p)
240/// * `info` - Standardization information from [`standardize_xy`]
241///
242/// # Returns
243///
244/// A tuple `(beta0, beta_slopes)` where:
245/// - `beta0` is the intercept on the original scale
246/// - `beta_slopes` are the slope coefficients only (excluding intercept column coefficient)
247///
248/// # Unstandardization Formula
249///
250/// For non-intercept coefficients:
251/// ```text
252/// β_original[j] = (y_scale * β_std[j]) / x_scale[j]
253/// ```
254///
255/// For the intercept:
256/// ```text
257/// β₀ = y_mean - Σⱼ x_mean[j] * β_original[j]
258/// ```
259///
260/// If y was not standardized, `y_scale = 1`.
261/// If X was not standardized, `x_scale[j] = 1`.
262///
263/// # Note
264///
265/// If `intercept=true` in the info, `beta_std[0]` is assumed to be the intercept
266/// coefficient (which is already 0 in the standardized space since X was centered).
267/// The returned `beta_slopes` will NOT include this zeroed coefficient - only actual
268/// slope coefficients are returned.
269#[allow(clippy::needless_range_loop)]
270pub fn unstandardize_coefficients(beta_std: &[f64], info: &StandardizationInfo) -> (f64, Vec<f64>) {
271    let p = beta_std.len();
272    let y_scale = info.y_scale.unwrap_or(1.0);
273
274    // Determine where slope coefficients start in beta_std
275    let start_idx = if info.intercept { 1 } else { 0 };
276    let n_slopes = p - start_idx;
277
278    // Unstandardize slope coefficients only (exclude intercept column coefficient)
279    let mut beta_slopes = vec![0.0; n_slopes];
280    for j in start_idx..p {
281        let slope_idx = j - start_idx;
282        beta_slopes[slope_idx] = (y_scale * beta_std[j]) / info.x_scale[j];
283    }
284
285    // Compute intercept on original scale
286    // beta0 = y_mean - sum(x_mean[j] * beta_slopes[j-1]) for j in 1..p
287    let beta0 = if info.intercept {
288        let mut sum = 0.0;
289        for j in 1..p {
290            sum += info.x_mean[j] * beta_slopes[j - 1];
291        }
292        info.y_mean - sum
293    } else {
294        0.0
295    };
296
297    (beta0, beta_slopes)
298}
299
300/// Computes predictions using unstandardized coefficients.
301///
302/// # Arguments
303///
304/// * `x_new` - New data matrix (n_new × p, with intercept column if applicable)
305/// * `beta0` - Intercept on original scale
306/// * `beta` - Slope coefficients on original scale (does NOT include intercept column coefficient)
307///
308/// # Returns
309///
310/// Predictions for each row in x_new.
311///
312/// # Note
313///
314/// If `x_new` has an intercept column (first column of all ones), `beta` should have
315/// `p - 1` elements corresponding to the non-intercept columns. If `x_new` has no
316/// intercept column, `beta` should have `p` elements.
317#[allow(clippy::needless_range_loop)]
318pub fn predict(x_new: &Matrix, beta0: f64, beta: &[f64]) -> Vec<f64> {
319    let n = x_new.rows;
320    let p = x_new.cols;
321
322    let mut predictions = vec![0.0; n];
323
324    // Determine if there's an intercept column based on beta length
325    // If beta has one fewer element than columns, first column is intercept
326    let has_intercept_col = beta.len() == p - 1;
327    let start_col = if has_intercept_col { 1 } else { 0 };
328
329    for i in 0..n {
330        let mut sum = beta0;
331        for (j, &beta_j) in beta.iter().enumerate() {
332            let col = start_col + j;
333            if col < p {
334                sum += x_new.get(i, col) * beta_j;
335            }
336        }
337        predictions[i] = sum;
338    }
339
340    predictions
341}
342
343#[cfg(test)]
344mod tests {
345    use super::*;
346
347    #[test]
348    fn test_standardize_xy_with_intercept() {
349        // Simple test data
350        let x_data = vec![1.0, 2.0, 3.0, 1.0, 4.0, 6.0, 1.0, 6.0, 9.0];
351        let x = Matrix::new(3, 3, x_data);
352        let y = vec![3.0, 5.0, 7.0];
353
354        let options = StandardizeOptions {
355            intercept: true,
356            standardize_x: true,
357            standardize_y: false,
358        };
359
360        let (x_std, y_std, info) = standardize_xy(&x, &y, &options);
361
362        // First column (intercept) should be unchanged
363        assert_eq!(x_std.get(0, 0), 1.0);
364        assert_eq!(x_std.get(1, 0), 1.0);
365        assert_eq!(x_std.get(2, 0), 1.0);
366
367        // y is now centered when intercept=true (required for coordinate descent)
368        let y_mean = 5.0; // mean of [3, 5, 7]
369        for i in 0..3 {
370            assert!((y_std[i] - (y[i] - y_mean)).abs() < 1e-10);
371        }
372
373        // x_mean should capture the column means
374        assert_eq!(info.x_mean[0], 0.0); // Intercept column
375        assert!((info.x_mean[1] - 4.0).abs() < 1e-10);
376        assert!((info.x_mean[2] - 6.0).abs() < 1e-10);
377    }
378
379    #[test]
380    fn test_unstandardize_coefficients() {
381        // Create a simple standardization scenario
382        let x_mean = vec![0.0, 4.0, 6.0];
383        let x_scale = vec![1.0, 2.0, 3.0];
384        let y_mean = 5.0;
385        let y_scale = Some(2.0);
386
387        let info = StandardizationInfo {
388            x_mean: x_mean.clone(),
389            x_scale: x_scale.clone(),
390            y_mean,
391            y_scale,
392            intercept: true,
393            standardized_x: true,
394            standardized_y: true,
395        };
396
397        // Coefficients in standardized space: [intercept=0, beta1=1, beta2=2]
398        let beta_std = vec![0.0, 1.0, 2.0];
399
400        let (beta0, beta_slopes) = unstandardize_coefficients(&beta_std, &info);
401
402        // Check unstandardization - beta_slopes now only contains slope coefficients
403        // beta_slopes[0] = (y_scale * beta_std[1]) / x_scale[1] = (2 * 1) / 2 = 1
404        assert!((beta_slopes[0] - 1.0).abs() < 1e-10);
405        // beta_slopes[1] = (y_scale * beta_std[2]) / x_scale[2] = (2 * 2) / 3 = 4/3
406        assert!((beta_slopes[1] - 4.0 / 3.0).abs() < 1e-10);
407
408        // beta0 = y_mean - sum(x_mean[j] * beta_slopes[j-1])
409        //      = 5 - (4 * 1 + 6 * 4/3) = 5 - 4 - 8 = -7
410        assert!((beta0 - (-7.0)).abs() < 1e-10);
411
412        // Verify beta_slopes has the correct length (only slopes, not intercept col coef)
413        assert_eq!(beta_slopes.len(), 2);
414    }
415
416    #[test]
417    fn test_predict() {
418        // X has intercept column (first col all 1s) plus 2 predictors
419        let x_data = vec![1.0, 2.0, 3.0, 1.0, 4.0, 6.0];
420        let x = Matrix::new(2, 3, x_data);
421
422        // beta0 = 1, beta = [2.0, 3.0] (slope coefficients only, no intercept col coef)
423        let beta0 = 1.0;
424        let beta = vec![2.0, 3.0];
425
426        let preds = predict(&x, beta0, &beta);
427
428        // pred[0] = 1 + 2*2 + 3*3 = 1 + 4 + 9 = 14
429        assert!((preds[0] - 14.0).abs() < 1e-10);
430        // pred[1] = 1 + 2*4 + 3*6 = 1 + 8 + 18 = 27
431        assert!((preds[1] - 27.0).abs() < 1e-10);
432    }
433}