Skip to main content

linreg_core/regularized/
preprocess.rs

1//! Data preprocessing for regularized regression.
2//!
3//! This module provides standardization utilities that match glmnet's behavior:
4//!
5//! - Predictors are centered and scaled (if enabled)
6//! - The intercept column is not penalized, so it's handled specially
7//! - Coefficients can be unstandardized back to the original scale
8//!
9//! # Standardization Convention
10//!
11//! The scaling factor used is `sqrt(sum(x²) / n)`, which gives unit variance
12//! under the 1/n convention (matching glmnet).
13
14use crate::linalg::{vec_mean, Matrix};
15
16/// Information stored during standardization, used to unstandardize coefficients.
17///
18/// This struct captures all the information needed to transform coefficients
19/// from the standardized space back to the original data scale.
20///
21/// # Fields
22///
23/// * `x_mean` - Mean of each predictor column (length p)
24/// * `x_scale` - Scale factor for each predictor column (length p)
25/// * `y_mean` - Mean of response variable
26/// * `y_scale` - Scale factor for response (optional, used for lambda path)
27/// * `intercept` - Whether an intercept term was included
28/// * `standardized_x` - Whether X was standardized
29/// * `standardized_y` - Whether y was standardized
30#[derive(Clone, Debug)]
31pub struct StandardizationInfo {
32    /// Mean of each predictor column
33    pub x_mean: Vec<f64>,
34    /// Scale factor for each predictor column
35    pub x_scale: Vec<f64>,
36    /// Mean of response variable
37    pub y_mean: f64,
38    /// Scale factor for response (for lambda path construction)
39    pub y_scale: Option<f64>,
40    /// Whether an intercept was included
41    pub intercept: bool,
42    /// Whether X was standardized
43    pub standardized_x: bool,
44    /// Whether y was standardized
45    pub standardized_y: bool,
46}
47
48/// Options for standardization.
49///
50/// # Fields
51///
52/// * `intercept` - Whether to include/center an intercept (default: true)
53/// * `standardize_x` - Whether to standardize predictors (default: true)
54/// * `standardize_y` - Whether to standardize response (default: false)
55///
56/// # Note
57///
58/// Setting `standardize_y` to `true` is mainly useful when you want to match
59/// glmnet's lambda sequence exactly. For single-lambda fits, you typically
60/// don't need to standardize y.
61#[derive(Clone, Debug)]
62pub struct StandardizeOptions {
63    /// Whether to include an intercept (and center X)
64    pub intercept: bool,
65    /// Whether to standardize predictor columns
66    pub standardize_x: bool,
67    /// Whether to standardize the response variable
68    pub standardize_y: bool,
69}
70
71impl Default for StandardizeOptions {
72    fn default() -> Self {
73        StandardizeOptions {
74            intercept: true,
75            standardize_x: true,
76            standardize_y: false,
77        }
78    }
79}
80
81/// Standardizes X and y for regularized regression (glmnet-compatible).
82///
83/// This function performs the same standardization as glmnet with
84/// `standardize=TRUE`. The first column of X is assumed to be the intercept
85/// (all ones) and is NOT standardized.
86///
87/// # Arguments
88///
89/// * `x` - Design matrix (n × p). First column should be intercept if `intercept=true`.
90/// * `y` - Response vector (n elements)
91/// * `options` - Standardization options
92///
93/// # Returns
94///
95/// A tuple `(x_std, y_std, info)` where:
96/// - `x_std` is the standardized design matrix
97/// - `y_std` is the (optionally) standardized response
98/// - `info` contains the standardization parameters for unstandardization
99///
100/// # Standardization Details
101///
102/// For the intercept column (first column, if present):
103/// - Not centered (stays as ones)
104/// - Not scaled
105///
106/// For other columns (if `standardize_x=true`):
107/// - Centered: `x_centered = x - mean(x)`
108/// - Scaled: `x_scaled = x_centered / sqrt(sum(x²) / n)`
109///
110/// For y (if `standardize_y=true`):
111/// - Centered: `y_centered = y - mean(y)`
112/// - Scaled: `y_scaled = y_centered / sqrt(sum(y²) / n)`
113pub fn standardize_xy(x: &Matrix, y: &[f64], options: &StandardizeOptions) -> (Matrix, Vec<f64>, StandardizationInfo) {
114    let n = x.rows;
115    let p = x.cols;
116
117    let mut x_std = x.clone();
118    let mut y_std = y.to_vec();
119
120    let mut x_mean = vec![0.0; p];
121    let mut x_scale = vec![1.0; p];
122
123    let y_mean = if options.intercept && !y.is_empty() {
124        vec_mean(y)
125    } else {
126        0.0
127    };
128
129    // Always center y when there's an intercept (required for coordinate descent)
130    // Additionally scale y if standardize_y is requested
131    let y_scale = if options.intercept {
132        // Center y
133        for yi in y_std.iter_mut() {
134            *yi -= y_mean;
135        }
136
137        if options.standardize_y {
138            // Also scale y
139            let y_var = y_std.iter().map(|&yi| yi * yi).sum::<f64>() / n as f64;
140            let y_scale_val = y_var.sqrt();
141            if y_scale_val > 0.0 {
142                for yi in y_std.iter_mut() {
143                    *yi /= y_scale_val;
144                }
145            }
146            Some(y_scale_val)
147        } else {
148            None
149        }
150    } else if options.standardize_y {
151        // No intercept but standardize_y requested - center and scale
152        let y_mean_local = vec_mean(y);
153        for yi in y_std.iter_mut() {
154            *yi -= y_mean_local;
155        }
156        let y_var = y_std.iter().map(|&yi| yi * yi).sum::<f64>() / n as f64;
157        let y_scale_val = y_var.sqrt();
158        if y_scale_val > 0.0 {
159            for yi in y_std.iter_mut() {
160                *yi /= y_scale_val;
161            }
162        }
163        Some(y_scale_val)
164    } else {
165        None
166    };
167
168    // Standardize X columns
169    // If intercept is present, first column is NOT standardized
170    let start_col = if options.intercept { 1 } else { 0 };
171
172    for j in start_col..p {
173        // Compute column mean
174        let mut col_mean = 0.0;
175        for i in 0..n {
176            col_mean += x_std.get(i, j);
177        }
178        col_mean /= n as f64;
179        x_mean[j] = col_mean;
180
181        // Always center the column (required for coordinate descent to work correctly)
182        for i in 0..n {
183            let val = x_std.get(i, j) - col_mean;
184            x_std.set(i, j, val);
185        }
186
187        // Always compute and apply scaling (coordinate descent assumes unit variance)
188        // The standardize_x option only affects how coefficients are returned
189        let mut col_scale_sq = 0.0;
190        for i in 0..n {
191            let val = x_std.get(i, j);
192            col_scale_sq += val * val;
193        }
194        let col_scale = (col_scale_sq / n as f64).sqrt();
195
196        if col_scale > 0.0 {
197            // Always scale internally for correct coordinate descent behavior
198            for i in 0..n {
199                let val = x_std.get(i, j) / col_scale;
200                x_std.set(i, j, val);
201            }
202
203            // Always store actual scale factor for correct unstandardization
204            // (coefficients are always returned on original scale, matching glmnet)
205            x_scale[j] = col_scale;
206        }
207    }
208
209    // If intercept column exists, set its scale to 1.0 (not penalized)
210    if options.intercept && p > 0 {
211        x_scale[0] = 1.0;
212        x_mean[0] = 0.0;  // Intercept column has no "mean" to subtract
213    }
214
215    let info = StandardizationInfo {
216        x_mean,
217        x_scale,
218        y_mean,
219        y_scale,
220        intercept: options.intercept,
221        standardized_x: options.standardize_x,
222        standardized_y: options.standardize_y,
223    };
224
225    (x_std, y_std, info)
226}
227
228/// Unstandardizes coefficients from the standardized space back to original scale.
229///
230/// This reverses the standardization transformation to get coefficients that
231/// can be applied to the original (unscaled) data.
232///
233/// # Arguments
234///
235/// * `beta_std` - Coefficients in standardized space (length p)
236/// * `info` - Standardization information from [`standardize_xy`]
237///
238/// # Returns
239///
240/// A tuple `(beta0, beta_slopes)` where:
241/// - `beta0` is the intercept on the original scale
242/// - `beta_slopes` are the slope coefficients only (excluding intercept column coefficient)
243///
244/// # Unstandardization Formula
245///
246/// For non-intercept coefficients:
247/// ```text
248/// β_original[j] = (y_scale * β_std[j]) / x_scale[j]
249/// ```
250///
251/// For the intercept:
252/// ```text
253/// β₀ = y_mean - Σⱼ x_mean[j] * β_original[j]
254/// ```
255///
256/// If y was not standardized, `y_scale = 1`.
257/// If X was not standardized, `x_scale[j] = 1`.
258///
259/// # Note
260///
261/// If `intercept=true` in the info, `beta_std[0]` is assumed to be the intercept
262/// coefficient (which is already 0 in the standardized space since X was centered).
263/// The returned `beta_slopes` will NOT include this zeroed coefficient - only actual
264/// slope coefficients are returned.
265pub fn unstandardize_coefficients(beta_std: &[f64], info: &StandardizationInfo) -> (f64, Vec<f64>) {
266    let p = beta_std.len();
267    let y_scale = info.y_scale.unwrap_or(1.0);
268
269    // Determine where slope coefficients start in beta_std
270    let start_idx = if info.intercept { 1 } else { 0 };
271    let n_slopes = p - start_idx;
272
273    // Unstandardize slope coefficients only (exclude intercept column coefficient)
274    let mut beta_slopes = vec![0.0; n_slopes];
275    for j in start_idx..p {
276        let slope_idx = j - start_idx;
277        beta_slopes[slope_idx] = (y_scale * beta_std[j]) / info.x_scale[j];
278    }
279
280    // Compute intercept on original scale
281    // beta0 = y_mean - sum(x_mean[j] * beta_slopes[j-1]) for j in 1..p
282    let beta0 = if info.intercept {
283        let mut sum = 0.0;
284        for j in 1..p {
285            sum += info.x_mean[j] * beta_slopes[j - 1];
286        }
287        info.y_mean - sum
288    } else {
289        0.0
290    };
291
292    (beta0, beta_slopes)
293}
294
295/// Computes predictions using unstandardized coefficients.
296///
297/// # Arguments
298///
299/// * `x_new` - New data matrix (n_new × p, with intercept column if applicable)
300/// * `beta0` - Intercept on original scale
301/// * `beta` - Slope coefficients on original scale (does NOT include intercept column coefficient)
302///
303/// # Returns
304///
305/// Predictions for each row in x_new.
306///
307/// # Note
308///
309/// If `x_new` has an intercept column (first column of all ones), `beta` should have
310/// `p - 1` elements corresponding to the non-intercept columns. If `x_new` has no
311/// intercept column, `beta` should have `p` elements.
312pub fn predict(x_new: &Matrix, beta0: f64, beta: &[f64]) -> Vec<f64> {
313    let n = x_new.rows;
314    let p = x_new.cols;
315
316    let mut predictions = vec![0.0; n];
317
318    // Determine if there's an intercept column based on beta length
319    // If beta has one fewer element than columns, first column is intercept
320    let has_intercept_col = beta.len() == p - 1;
321    let start_col = if has_intercept_col { 1 } else { 0 };
322
323    for i in 0..n {
324        let mut sum = beta0;
325        for (j, &beta_j) in beta.iter().enumerate() {
326            let col = start_col + j;
327            if col < p {
328                sum += x_new.get(i, col) * beta_j;
329            }
330        }
331        predictions[i] = sum;
332    }
333
334    predictions
335}
336
337#[cfg(test)]
338mod tests {
339    use super::*;
340
341    #[test]
342    fn test_standardize_xy_with_intercept() {
343        // Simple test data
344        let x_data = vec![
345            1.0, 2.0, 3.0,
346            1.0, 4.0, 6.0,
347            1.0, 6.0, 9.0,
348        ];
349        let x = Matrix::new(3, 3, x_data);
350        let y = vec![3.0, 5.0, 7.0];
351
352        let options = StandardizeOptions {
353            intercept: true,
354            standardize_x: true,
355            standardize_y: false,
356        };
357
358        let (x_std, y_std, info) = standardize_xy(&x, &y, &options);
359
360        // First column (intercept) should be unchanged
361        assert_eq!(x_std.get(0, 0), 1.0);
362        assert_eq!(x_std.get(1, 0), 1.0);
363        assert_eq!(x_std.get(2, 0), 1.0);
364
365        // y is now centered when intercept=true (required for coordinate descent)
366        let y_mean = 5.0; // mean of [3, 5, 7]
367        for i in 0..3 {
368            assert!((y_std[i] - (y[i] - y_mean)).abs() < 1e-10);
369        }
370
371        // x_mean should capture the column means
372        assert_eq!(info.x_mean[0], 0.0); // Intercept column
373        assert!((info.x_mean[1] - 4.0).abs() < 1e-10);
374        assert!((info.x_mean[2] - 6.0).abs() < 1e-10);
375    }
376
377    #[test]
378    fn test_unstandardize_coefficients() {
379        // Create a simple standardization scenario
380        let x_mean = vec![0.0, 4.0, 6.0];
381        let x_scale = vec![1.0, 2.0, 3.0];
382        let y_mean = 5.0;
383        let y_scale = Some(2.0);
384
385        let info = StandardizationInfo {
386            x_mean: x_mean.clone(),
387            x_scale: x_scale.clone(),
388            y_mean,
389            y_scale,
390            intercept: true,
391            standardized_x: true,
392            standardized_y: true,
393        };
394
395        // Coefficients in standardized space: [intercept=0, beta1=1, beta2=2]
396        let beta_std = vec![0.0, 1.0, 2.0];
397
398        let (beta0, beta_slopes) = unstandardize_coefficients(&beta_std, &info);
399
400        // Check unstandardization - beta_slopes now only contains slope coefficients
401        // beta_slopes[0] = (y_scale * beta_std[1]) / x_scale[1] = (2 * 1) / 2 = 1
402        assert!((beta_slopes[0] - 1.0).abs() < 1e-10);
403        // beta_slopes[1] = (y_scale * beta_std[2]) / x_scale[2] = (2 * 2) / 3 = 4/3
404        assert!((beta_slopes[1] - 4.0/3.0).abs() < 1e-10);
405
406        // beta0 = y_mean - sum(x_mean[j] * beta_slopes[j-1])
407        //      = 5 - (4 * 1 + 6 * 4/3) = 5 - 4 - 8 = -7
408        assert!((beta0 - (-7.0)).abs() < 1e-10);
409
410        // Verify beta_slopes has the correct length (only slopes, not intercept col coef)
411        assert_eq!(beta_slopes.len(), 2);
412    }
413
414    #[test]
415    fn test_predict() {
416        // X has intercept column (first col all 1s) plus 2 predictors
417        let x_data = vec![
418            1.0, 2.0, 3.0,
419            1.0, 4.0, 6.0,
420        ];
421        let x = Matrix::new(2, 3, x_data);
422
423        // beta0 = 1, beta = [2.0, 3.0] (slope coefficients only, no intercept col coef)
424        let beta0 = 1.0;
425        let beta = vec![2.0, 3.0];
426
427        let preds = predict(&x, beta0, &beta);
428
429        // pred[0] = 1 + 2*2 + 3*3 = 1 + 4 + 9 = 14
430        assert!((preds[0] - 14.0).abs() < 1e-10);
431        // pred[1] = 1 + 2*4 + 3*6 = 1 + 8 + 18 = 27
432        assert!((preds[1] - 27.0).abs() < 1e-10);
433    }
434}