linreg_core/regularized/preprocess.rs
1//! Data preprocessing for regularized regression.
2//!
3//! This module provides standardization utilities that match glmnet's behavior:
4//!
5//! - Predictors are centered and scaled (if enabled)
6//! - The intercept column is not penalized, so it's handled specially
7//! - Coefficients can be unstandardized back to the original scale
8//!
9//! # Standardization Convention
10//!
11//! The scaling factor used is `sqrt(sum(x²) / n)`, which gives unit variance
12//! under the 1/n convention (matching glmnet).
13
14use crate::linalg::{vec_mean, Matrix};
15
16/// Information stored during standardization, used to unstandardize coefficients.
17///
18/// This struct captures all the information needed to transform coefficients
19/// from the standardized space back to the original data scale.
20///
21/// # Fields
22///
23/// * `x_mean` - Mean of each predictor column (length p)
24/// * `x_scale` - Scale factor for each predictor column (length p)
25/// * `y_mean` - Mean of response variable
26/// * `y_scale` - Scale factor for response (optional, used for lambda path)
27/// * `intercept` - Whether an intercept term was included
28/// * `standardized_x` - Whether X was standardized
29/// * `standardized_y` - Whether y was standardized
30#[derive(Clone, Debug)]
31pub struct StandardizationInfo {
32 /// Mean of each predictor column
33 pub x_mean: Vec<f64>,
34 /// Scale factor for each predictor column
35 pub x_scale: Vec<f64>,
36 /// Mean of response variable
37 pub y_mean: f64,
38 /// Scale factor for response (for lambda path construction)
39 pub y_scale: Option<f64>,
40 /// Whether an intercept was included
41 pub intercept: bool,
42 /// Whether X was standardized
43 pub standardized_x: bool,
44 /// Whether y was standardized
45 pub standardized_y: bool,
46}
47
48/// Options for standardization.
49///
50/// # Fields
51///
52/// * `intercept` - Whether to include/center an intercept (default: true)
53/// * `standardize_x` - Whether to standardize predictors (default: true)
54/// * `standardize_y` - Whether to standardize response (default: false)
55///
56/// # Note
57///
58/// Setting `standardize_y` to `true` is mainly useful when you want to match
59/// glmnet's lambda sequence exactly. For single-lambda fits, you typically
60/// don't need to standardize y.
61#[derive(Clone, Debug)]
62pub struct StandardizeOptions {
63 /// Whether to include an intercept (and center X)
64 pub intercept: bool,
65 /// Whether to standardize predictor columns
66 pub standardize_x: bool,
67 /// Whether to standardize the response variable
68 pub standardize_y: bool,
69}
70
71impl Default for StandardizeOptions {
72 fn default() -> Self {
73 StandardizeOptions {
74 intercept: true,
75 standardize_x: true,
76 standardize_y: false,
77 }
78 }
79}
80
81/// Standardizes X and y for regularized regression (glmnet-compatible).
82///
83/// This function performs the same standardization as glmnet with
84/// `standardize=TRUE`. The first column of X is assumed to be the intercept
85/// (all ones) and is NOT standardized.
86///
87/// # Arguments
88///
89/// * `x` - Design matrix (n × p). First column should be intercept if `intercept=true`.
90/// * `y` - Response vector (n elements)
91/// * `options` - Standardization options
92///
93/// # Returns
94///
95/// A tuple `(x_std, y_std, info)` where:
96/// - `x_std` is the standardized design matrix
97/// - `y_std` is the (optionally) standardized response
98/// - `info` contains the standardization parameters for unstandardization
99///
100/// # Standardization Details
101///
102/// For the intercept column (first column, if present):
103/// - Not centered (stays as ones)
104/// - Not scaled
105///
106/// For other columns (if `standardize_x=true`):
107/// - Centered: `x_centered = x - mean(x)`
108/// - Scaled: `x_scaled = x_centered / sqrt(sum(x²) / n)`
109///
110/// For y (if `standardize_y=true`):
111/// - Centered: `y_centered = y - mean(y)`
112/// - Scaled: `y_scaled = y_centered / sqrt(sum(y²) / n)`
113pub fn standardize_xy(x: &Matrix, y: &[f64], options: &StandardizeOptions) -> (Matrix, Vec<f64>, StandardizationInfo) {
114 let n = x.rows;
115 let p = x.cols;
116
117 let mut x_std = x.clone();
118 let mut y_std = y.to_vec();
119
120 let mut x_mean = vec![0.0; p];
121 let mut x_scale = vec![1.0; p];
122
123 let y_mean = if options.intercept && !y.is_empty() {
124 vec_mean(y)
125 } else {
126 0.0
127 };
128
129 // Always center y when there's an intercept (required for coordinate descent)
130 // Additionally scale y if standardize_y is requested
131 let y_scale = if options.intercept {
132 // Center y
133 for yi in y_std.iter_mut() {
134 *yi -= y_mean;
135 }
136
137 if options.standardize_y {
138 // Also scale y
139 let y_var = y_std.iter().map(|&yi| yi * yi).sum::<f64>() / n as f64;
140 let y_scale_val = y_var.sqrt();
141 if y_scale_val > 0.0 {
142 for yi in y_std.iter_mut() {
143 *yi /= y_scale_val;
144 }
145 }
146 Some(y_scale_val)
147 } else {
148 None
149 }
150 } else if options.standardize_y {
151 // No intercept but standardize_y requested - center and scale
152 let y_mean_local = vec_mean(y);
153 for yi in y_std.iter_mut() {
154 *yi -= y_mean_local;
155 }
156 let y_var = y_std.iter().map(|&yi| yi * yi).sum::<f64>() / n as f64;
157 let y_scale_val = y_var.sqrt();
158 if y_scale_val > 0.0 {
159 for yi in y_std.iter_mut() {
160 *yi /= y_scale_val;
161 }
162 }
163 Some(y_scale_val)
164 } else {
165 None
166 };
167
168 // Standardize X columns
169 // If intercept is present, first column is NOT standardized
170 let start_col = if options.intercept { 1 } else { 0 };
171
172 for j in start_col..p {
173 // Compute column mean
174 let mut col_mean = 0.0;
175 for i in 0..n {
176 col_mean += x_std.get(i, j);
177 }
178 col_mean /= n as f64;
179 x_mean[j] = col_mean;
180
181 // Always center the column (required for coordinate descent to work correctly)
182 for i in 0..n {
183 let val = x_std.get(i, j) - col_mean;
184 x_std.set(i, j, val);
185 }
186
187 // Always compute and apply scaling (coordinate descent assumes unit variance)
188 // The standardize_x option only affects how coefficients are returned
189 let mut col_scale_sq = 0.0;
190 for i in 0..n {
191 let val = x_std.get(i, j);
192 col_scale_sq += val * val;
193 }
194 let col_scale = (col_scale_sq / n as f64).sqrt();
195
196 if col_scale > 0.0 {
197 // Always scale internally for correct coordinate descent behavior
198 for i in 0..n {
199 let val = x_std.get(i, j) / col_scale;
200 x_std.set(i, j, val);
201 }
202
203 // Always store actual scale factor for correct unstandardization
204 // (coefficients are always returned on original scale, matching glmnet)
205 x_scale[j] = col_scale;
206 }
207 }
208
209 // If intercept column exists, set its scale to 1.0 (not penalized)
210 if options.intercept && p > 0 {
211 x_scale[0] = 1.0;
212 x_mean[0] = 0.0; // Intercept column has no "mean" to subtract
213 }
214
215 let info = StandardizationInfo {
216 x_mean,
217 x_scale,
218 y_mean,
219 y_scale,
220 intercept: options.intercept,
221 standardized_x: options.standardize_x,
222 standardized_y: options.standardize_y,
223 };
224
225 (x_std, y_std, info)
226}
227
228/// Unstandardizes coefficients from the standardized space back to original scale.
229///
230/// This reverses the standardization transformation to get coefficients that
231/// can be applied to the original (unscaled) data.
232///
233/// # Arguments
234///
235/// * `beta_std` - Coefficients in standardized space (length p)
236/// * `info` - Standardization information from [`standardize_xy`]
237///
238/// # Returns
239///
240/// A tuple `(beta0, beta_slopes)` where:
241/// - `beta0` is the intercept on the original scale
242/// - `beta_slopes` are the slope coefficients only (excluding intercept column coefficient)
243///
244/// # Unstandardization Formula
245///
246/// For non-intercept coefficients:
247/// ```text
248/// β_original[j] = (y_scale * β_std[j]) / x_scale[j]
249/// ```
250///
251/// For the intercept:
252/// ```text
253/// β₀ = y_mean - Σⱼ x_mean[j] * β_original[j]
254/// ```
255///
256/// If y was not standardized, `y_scale = 1`.
257/// If X was not standardized, `x_scale[j] = 1`.
258///
259/// # Note
260///
261/// If `intercept=true` in the info, `beta_std[0]` is assumed to be the intercept
262/// coefficient (which is already 0 in the standardized space since X was centered).
263/// The returned `beta_slopes` will NOT include this zeroed coefficient - only actual
264/// slope coefficients are returned.
265pub fn unstandardize_coefficients(beta_std: &[f64], info: &StandardizationInfo) -> (f64, Vec<f64>) {
266 let p = beta_std.len();
267 let y_scale = info.y_scale.unwrap_or(1.0);
268
269 // Determine where slope coefficients start in beta_std
270 let start_idx = if info.intercept { 1 } else { 0 };
271 let n_slopes = p - start_idx;
272
273 // Unstandardize slope coefficients only (exclude intercept column coefficient)
274 let mut beta_slopes = vec![0.0; n_slopes];
275 for j in start_idx..p {
276 let slope_idx = j - start_idx;
277 beta_slopes[slope_idx] = (y_scale * beta_std[j]) / info.x_scale[j];
278 }
279
280 // Compute intercept on original scale
281 // beta0 = y_mean - sum(x_mean[j] * beta_slopes[j-1]) for j in 1..p
282 let beta0 = if info.intercept {
283 let mut sum = 0.0;
284 for j in 1..p {
285 sum += info.x_mean[j] * beta_slopes[j - 1];
286 }
287 info.y_mean - sum
288 } else {
289 0.0
290 };
291
292 (beta0, beta_slopes)
293}
294
295/// Computes predictions using unstandardized coefficients.
296///
297/// # Arguments
298///
299/// * `x_new` - New data matrix (n_new × p, with intercept column if applicable)
300/// * `beta0` - Intercept on original scale
301/// * `beta` - Slope coefficients on original scale (does NOT include intercept column coefficient)
302///
303/// # Returns
304///
305/// Predictions for each row in x_new.
306///
307/// # Note
308///
309/// If `x_new` has an intercept column (first column of all ones), `beta` should have
310/// `p - 1` elements corresponding to the non-intercept columns. If `x_new` has no
311/// intercept column, `beta` should have `p` elements.
312pub fn predict(x_new: &Matrix, beta0: f64, beta: &[f64]) -> Vec<f64> {
313 let n = x_new.rows;
314 let p = x_new.cols;
315
316 let mut predictions = vec![0.0; n];
317
318 // Determine if there's an intercept column based on beta length
319 // If beta has one fewer element than columns, first column is intercept
320 let has_intercept_col = beta.len() == p - 1;
321 let start_col = if has_intercept_col { 1 } else { 0 };
322
323 for i in 0..n {
324 let mut sum = beta0;
325 for (j, &beta_j) in beta.iter().enumerate() {
326 let col = start_col + j;
327 if col < p {
328 sum += x_new.get(i, col) * beta_j;
329 }
330 }
331 predictions[i] = sum;
332 }
333
334 predictions
335}
336
337#[cfg(test)]
338mod tests {
339 use super::*;
340
341 #[test]
342 fn test_standardize_xy_with_intercept() {
343 // Simple test data
344 let x_data = vec![
345 1.0, 2.0, 3.0,
346 1.0, 4.0, 6.0,
347 1.0, 6.0, 9.0,
348 ];
349 let x = Matrix::new(3, 3, x_data);
350 let y = vec![3.0, 5.0, 7.0];
351
352 let options = StandardizeOptions {
353 intercept: true,
354 standardize_x: true,
355 standardize_y: false,
356 };
357
358 let (x_std, y_std, info) = standardize_xy(&x, &y, &options);
359
360 // First column (intercept) should be unchanged
361 assert_eq!(x_std.get(0, 0), 1.0);
362 assert_eq!(x_std.get(1, 0), 1.0);
363 assert_eq!(x_std.get(2, 0), 1.0);
364
365 // y is now centered when intercept=true (required for coordinate descent)
366 let y_mean = 5.0; // mean of [3, 5, 7]
367 for i in 0..3 {
368 assert!((y_std[i] - (y[i] - y_mean)).abs() < 1e-10);
369 }
370
371 // x_mean should capture the column means
372 assert_eq!(info.x_mean[0], 0.0); // Intercept column
373 assert!((info.x_mean[1] - 4.0).abs() < 1e-10);
374 assert!((info.x_mean[2] - 6.0).abs() < 1e-10);
375 }
376
377 #[test]
378 fn test_unstandardize_coefficients() {
379 // Create a simple standardization scenario
380 let x_mean = vec![0.0, 4.0, 6.0];
381 let x_scale = vec![1.0, 2.0, 3.0];
382 let y_mean = 5.0;
383 let y_scale = Some(2.0);
384
385 let info = StandardizationInfo {
386 x_mean: x_mean.clone(),
387 x_scale: x_scale.clone(),
388 y_mean,
389 y_scale,
390 intercept: true,
391 standardized_x: true,
392 standardized_y: true,
393 };
394
395 // Coefficients in standardized space: [intercept=0, beta1=1, beta2=2]
396 let beta_std = vec![0.0, 1.0, 2.0];
397
398 let (beta0, beta_slopes) = unstandardize_coefficients(&beta_std, &info);
399
400 // Check unstandardization - beta_slopes now only contains slope coefficients
401 // beta_slopes[0] = (y_scale * beta_std[1]) / x_scale[1] = (2 * 1) / 2 = 1
402 assert!((beta_slopes[0] - 1.0).abs() < 1e-10);
403 // beta_slopes[1] = (y_scale * beta_std[2]) / x_scale[2] = (2 * 2) / 3 = 4/3
404 assert!((beta_slopes[1] - 4.0/3.0).abs() < 1e-10);
405
406 // beta0 = y_mean - sum(x_mean[j] * beta_slopes[j-1])
407 // = 5 - (4 * 1 + 6 * 4/3) = 5 - 4 - 8 = -7
408 assert!((beta0 - (-7.0)).abs() < 1e-10);
409
410 // Verify beta_slopes has the correct length (only slopes, not intercept col coef)
411 assert_eq!(beta_slopes.len(), 2);
412 }
413
414 #[test]
415 fn test_predict() {
416 // X has intercept column (first col all 1s) plus 2 predictors
417 let x_data = vec![
418 1.0, 2.0, 3.0,
419 1.0, 4.0, 6.0,
420 ];
421 let x = Matrix::new(2, 3, x_data);
422
423 // beta0 = 1, beta = [2.0, 3.0] (slope coefficients only, no intercept col coef)
424 let beta0 = 1.0;
425 let beta = vec![2.0, 3.0];
426
427 let preds = predict(&x, beta0, &beta);
428
429 // pred[0] = 1 + 2*2 + 3*3 = 1 + 4 + 9 = 14
430 assert!((preds[0] - 14.0).abs() < 1e-10);
431 // pred[1] = 1 + 2*4 + 3*6 = 1 + 8 + 18 = 27
432 assert!((preds[1] - 27.0).abs() < 1e-10);
433 }
434}