linreg_core/regularized/elastic_net.rs
1//! Elastic Net regression (L1 + L2 regularized linear regression).
2//!
3//! This module provides a generalized elastic net implementation using cyclical
4//! coordinate descent with soft-thresholding and active set convergence strategies.
5//! It serves as the core engine for both Lasso (`alpha=1.0`) and Ridge (`alpha=0.0`).
6//!
7//! # Objective Function
8//!
9//! Minimizes over `(β₀, β)`:
10//!
11//! ```text
12//! (1/(2n)) * ||y - β₀ - Xβ||² + λ * [ (1-α)||β||₂²/2 + α||β||₁ ]
13//! ```
14//!
15//! Note on scaling: The internal implementation works with standardized data (unit norm columns).
16//! The lambda parameter is adjusted internally to match the scale expected by the formulation above.
17
18use crate::core::{aic, bic, log_likelihood};
19use crate::error::{Error, Result};
20use crate::linalg::Matrix;
21use crate::regularized::preprocess::{
22 predict, standardize_xy, unstandardize_coefficients, StandardizeOptions,
23};
24use crate::serialization::types::ModelType;
25use crate::impl_serialization;
26use serde::{Deserialize, Serialize};
27
/// Applies the soft-thresholding (shrinkage) operator
/// `S(z, γ) = sign(z) * max(|z| - γ, 0)`.
///
/// Lasso and Elastic Net coordinate descent use this operator to apply the L1
/// part of the penalty: inputs whose magnitude does not exceed the threshold
/// collapse to exactly zero, which is what makes the solutions sparse.
///
/// # Arguments
///
/// * `z` - Value to shrink
/// * `gamma` - Shrinkage amount (must not be negative)
///
/// # Returns
///
/// - `z - gamma` when `z > gamma`
/// - `z + gamma` when `z < -gamma`
/// - `0.0` in every other case (`|z| <= gamma`)
///
/// # Panics
///
/// Panics when `gamma` is negative.
///
/// # Example
///
/// ```
/// # use linreg_core::regularized::elastic_net::soft_threshold;
/// // Magnitudes above the threshold are pulled toward zero by `gamma`
/// assert_eq!(soft_threshold(5.0, 2.0), 3.0);
/// assert_eq!(soft_threshold(-5.0, 2.0), -3.0);
///
/// // Magnitudes at or below the threshold become exactly zero
/// assert_eq!(soft_threshold(1.0, 2.0), 0.0);
/// assert_eq!(soft_threshold(-1.0, 2.0), 0.0);
/// ```
#[inline]
pub fn soft_threshold(z: f64, gamma: f64) -> f64 {
    if gamma < 0.0 {
        panic!("Soft threshold gamma must be non-negative");
    }

    // Guards are evaluated top to bottom; anything that satisfies neither
    // comparison (including NaN) falls through to zero.
    match z {
        above if above > gamma => above - gamma,
        below if below < -gamma => below + gamma,
        _ => 0.0,
    }
}
75
/// Configuration for an elastic net fit.
///
/// Elastic net blends an L1 (Lasso) and an L2 (Ridge) penalty; these options
/// select the penalty strength and mix, how the data are preprocessed, and
/// when the coordinate descent solver stops.
///
/// # Fields
///
/// - `lambda` - Overall regularization strength (≥ 0; larger means more shrinkage)
/// - `alpha` - L1/L2 mixing weight (0 = Ridge, 1 = Lasso, 0.5 = equal mix)
/// - `intercept` - Whether the model has an intercept term
/// - `standardize` - Whether predictors are standardized before fitting
/// - `max_iter` - Upper limit on coordinate descent iterations
/// - `tol` - Convergence tolerance on coefficient changes
/// - `penalty_factor` - Optional per-feature penalty multipliers
/// - `warm_start` - Optional starting coefficients
/// - `weights` - Optional observation weights
/// - `coefficient_bounds` - Optional (lower, upper) limits per coefficient
///
/// # Example
///
/// ```
/// # use linreg_core::regularized::elastic_net::ElasticNetOptions;
/// let options = ElasticNetOptions {
///     lambda: 0.1,
///     alpha: 0.5, // Equal mix of L1 and L2
///     intercept: true,
///     standardize: true,
///     ..Default::default()
/// };
/// ```
#[derive(Clone, Debug)]
pub struct ElasticNetOptions {
    /// Regularization strength; must satisfy `lambda >= 0`.
    pub lambda: f64,
    /// Mixing parameter in `[0, 1]`: `1` gives Lasso, `0` gives Ridge.
    pub alpha: f64,
    /// Whether the model includes an intercept term.
    pub intercept: bool,
    /// Whether predictors are standardized before fitting.
    pub standardize: bool,
    /// Maximum number of coordinate descent iterations.
    pub max_iter: usize,
    /// Convergence tolerance applied to coefficient changes.
    pub tol: f64,
    /// Optional per-feature penalty factors.
    /// `None` means every feature uses a factor of 1.0.
    pub penalty_factor: Option<Vec<f64>>,
    /// Optional starting coefficients (warm start).
    /// When present, optimization begins from these values rather than from
    /// zero, which is what makes pathwise coordinate descent efficient.
    pub warm_start: Option<Vec<f64>>,
    /// Optional observation weights.
    /// When present, the length must equal the number of observations.
    /// Weights are normalized to sum to 1 internally.
    pub weights: Option<Vec<f64>>,
    /// Optional `(lower, upper)` bound for each predictor's coefficient.
    /// `None` leaves every coefficient unbounded, i.e. `(-inf, +inf)`.
    ///
    /// The vector needs one entry per predictor (the intercept is not
    /// counted). After every coordinate descent update the coefficient is
    /// clamped into `[lower, upper]`.
    ///
    /// # Examples
    /// * Non-negative least squares: `Some(vec![(0.0, f64::INFINITY); p])`
    /// * Upper bound only: `Some(vec![(-f64::INFINITY, 10.0); p])`
    /// * Both bounds: `Some(vec![(-5.0, 5.0); p])`
    ///
    /// # Notes
    /// * Bounds refer to coefficients on the ORIGINAL scale, not the standardized scale
    /// * The intercept is never bounded
    /// * Every pair must satisfy `lower <= upper`
    pub coefficient_bounds: Option<Vec<(f64, f64)>>,
}
149
150impl Default for ElasticNetOptions {
151 fn default() -> Self {
152 ElasticNetOptions {
153 lambda: 1.0,
154 alpha: 1.0, // Lasso default
155 intercept: true,
156 standardize: true,
157 max_iter: 100000,
158 tol: 1e-7,
159 penalty_factor: None,
160 warm_start: None,
161 weights: None,
162 coefficient_bounds: None,
163 }
164 }
165}
166
167/// Result of an elastic net fit.
168///
169/// Contains the fitted model coefficients, convergence information, and diagnostic metrics.
170///
171/// # Fields
172///
173/// - `lambda` - The regularization strength used
174/// - `alpha` - The elastic net mixing parameter (0 = Ridge, 1 = Lasso)
175/// - `intercept` - Intercept coefficient (never penalized)
176/// - `coefficients` - Slope coefficients (may be sparse for high alpha)
177/// - `fitted_values` - Predicted values on training data
178/// - `residuals` - Residuals (y - fitted_values)
179/// - `n_nonzero` - Number of non-zero coefficients (excluding intercept)
180/// - `iterations` - Number of coordinate descent iterations performed
181/// - `converged` - Whether the algorithm converged
182/// - `r_squared` - Coefficient of determination
183/// - `adj_r_squared` - Adjusted R²
184/// - `mse` - Mean squared error
185/// - `rmse` - Root mean squared error
186/// - `mae` - Mean absolute error
187/// - `log_likelihood` - Log-likelihood of the model (for model comparison)
188/// - `aic` - Akaike Information Criterion (lower = better)
189/// - `bic` - Bayesian Information Criterion (lower = better)
190///
191/// # Example
192///
193/// ```
194/// # use linreg_core::regularized::elastic_net::{elastic_net_fit, ElasticNetOptions};
195/// # use linreg_core::linalg::Matrix;
196/// # let y = vec![2.0, 4.0, 6.0, 8.0];
197/// # let x = Matrix::new(4, 2, vec![1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 4.0]);
198/// # let options = ElasticNetOptions { lambda: 0.1, alpha: 0.5, intercept: true, standardize: true, ..Default::default() };
199/// let fit = elastic_net_fit(&x, &y, &options).unwrap();
200///
201/// // Access fit results
202/// println!("Lambda: {}, Alpha: {}", fit.lambda, fit.alpha);
203/// println!("Non-zero coefficients: {}", fit.n_nonzero);
204/// println!("Converged: {}", fit.converged);
205/// println!("R²: {}", fit.r_squared);
206/// println!("AIC: {}", fit.aic);
207/// # Ok::<(), linreg_core::Error>(())
208/// ```
209#[derive(Clone, Debug, Serialize, Deserialize)]
210pub struct ElasticNetFit {
211 pub lambda: f64,
212 pub alpha: f64,
213 pub intercept: f64,
214 pub coefficients: Vec<f64>,
215 pub fitted_values: Vec<f64>,
216 pub residuals: Vec<f64>,
217 pub n_nonzero: usize,
218 pub iterations: usize,
219 pub converged: bool,
220 pub r_squared: f64,
221 pub adj_r_squared: f64,
222 pub mse: f64,
223 pub rmse: f64,
224 pub mae: f64,
225 pub log_likelihood: f64,
226 pub aic: f64,
227 pub bic: f64,
228}
229
230use crate::regularized::path::{make_lambda_path, LambdaPathOptions};
231
232/// Fits an elastic net regularization path.
233///
234/// This is the most efficient way to fit models for multiple lambda values.
235/// It performs data standardization once and uses warm starts to speed up
236/// convergence along the path.
237///
238/// # Arguments
239///
240/// * `x` - Design matrix
241/// * `y` - Response vector
242/// * `path_options` - Options for generating the lambda path
243/// * `fit_options` - Options for the elastic net fit (alpha, tol, etc.)
244///
245/// # Returns
246///
247/// A vector of `ElasticNetFit` structs, one for each lambda in the path.
248///
249/// # Example
250///
251/// ```
252/// # use linreg_core::regularized::elastic_net::{elastic_net_path, ElasticNetOptions};
253/// # use linreg_core::regularized::path::LambdaPathOptions;
254/// # use linreg_core::linalg::Matrix;
255/// let y = vec![2.0, 4.0, 6.0, 8.0];
256/// let x = Matrix::new(4, 2, vec![1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 4.0]);
257///
258/// let path_options = LambdaPathOptions {
259/// nlambda: 10,
260/// ..Default::default()
261/// };
262/// let fit_options = ElasticNetOptions {
263/// alpha: 0.5,
264/// ..Default::default()
265/// };
266///
267/// let path = elastic_net_path(&x, &y, &path_options, &fit_options).unwrap();
268/// assert_eq!(path.len(), 10); // One fit per lambda
269///
270/// // First model has strongest regularization (fewest non-zero coefficients)
271/// println!("Non-zero at lambda_max: {}", path[0].n_nonzero);
272/// // Last model has weakest regularization (most non-zero coefficients)
273/// println!("Non-zero at lambda_min: {}", path.last().unwrap().n_nonzero);
274/// # Ok::<(), linreg_core::Error>(())
275/// ```
276pub fn elastic_net_path(
277 x: &Matrix,
278 y: &[f64],
279 path_options: &LambdaPathOptions,
280 fit_options: &ElasticNetOptions,
281) -> Result<Vec<ElasticNetFit>> {
282 let n = x.rows;
283 let p = x.cols;
284
285 if y.len() != n {
286 return Err(Error::DimensionMismatch(format!(
287 "Length of y ({}) must match number of rows in X ({})",
288 y.len(), n
289 )));
290 }
291
292 // 1. Standardize X and y ONCE
293 let standardization_options = StandardizeOptions {
294 intercept: fit_options.intercept,
295 standardize_x: fit_options.standardize,
296 standardize_y: fit_options.intercept,
297 weights: fit_options.weights.clone(),
298 };
299
300 let (x_standardized, y_standardized, standardization_info) = standardize_xy(x, y, &standardization_options);
301
302 // 2. Generate lambda path
303 // If lambdas are not provided in options (which they aren't in LambdaPathOptions,
304 // it just controls generation), we generate them.
305 // NOTE: If the user wants specific lambdas, they should probably use a different API
306 // or we could add `lambdas: Option<&[f64]>` to this function.
307 // For now, we strictly generate them.
308
309 // We need to account for penalty factors in lambda generation if provided
310 let intercept_col = if fit_options.intercept { Some(0) } else { None };
311 let lambdas = make_lambda_path(
312 &x_standardized,
313 &y_standardized, // y_standardized is centered if intercept=true
314 path_options,
315 fit_options.penalty_factor.as_deref(),
316 intercept_col
317 );
318
319 // 3. Loop over lambdas with warm starts
320 let mut fits = Vec::with_capacity(lambdas.len());
321 let mut coefficients_standardized = vec![0.0; p]; // Initialize at 0
322
323 // Determine unpenalized columns
324 let first_penalized_column_index = if fit_options.intercept { 1 } else { 0 };
325
326 // Calculate scale factor for converting Internal lambdas to Public (user-facing) lambdas
327 // make_lambda_path returns Internal lambdas (for standardized data)
328 // We use these directly in the solver, but scale them for user reporting
329 let y_scale_factor = standardization_info.y_scale.unwrap_or(1.0);
330 // Public lambda = Internal lambda * y_scale_factor
331 // This converts from standardized scale to original data scale
332 let lambda_conversion_factor = if y_scale_factor > 1e-12 {
333 y_scale_factor
334 } else {
335 1.0
336 };
337
338 for &lambda_standardized_value in &lambdas {
339 // The path generation returns lambdas on the internal scale (for standardized data),
340 // which are used directly in coordinate descent without additional scaling.
341 let lambda_standardized = lambda_standardized_value;
342
343 // Transform coefficient bounds to standardized scale
344 // Bounds on original scale need to be converted: coefficients_standardized = beta_orig * x_scale / y_scale
345 let bounds_standardized: Option<Vec<(f64, f64)>> = fit_options.coefficient_bounds.as_ref().map(|bounds| {
346 let y_scale = standardization_info.y_scale.unwrap_or(1.0);
347 bounds.iter().enumerate().map(|(j, &(lower, upper))| {
348 // For each predictor j in original scale, the corresponding column
349 // in the standardized matrix is at index j+1 (col 0 is intercept)
350 let std_idx = j + 1;
351 let x_scale_predictor_j = if std_idx < standardization_info.x_scale.len() {
352 standardization_info.x_scale[std_idx]
353 } else {
354 1.0
355 };
356 let scale_factor = x_scale_predictor_j / y_scale;
357 (lower * scale_factor, upper * scale_factor)
358 }).collect()
359 });
360
361 let (iterations, converged) = coordinate_descent(
362 &x_standardized,
363 &y_standardized,
364 &mut coefficients_standardized,
365 lambda_standardized,
366 fit_options.alpha,
367 first_penalized_column_index,
368 fit_options.max_iter,
369 fit_options.tol,
370 fit_options.penalty_factor.as_deref(),
371 bounds_standardized.as_deref(),
372 &standardization_info.column_squared_norms,
373 )?;
374
375 // Unstandardize coefficients for output
376 let (intercept, beta_orig) = unstandardize_coefficients(&coefficients_standardized, &standardization_info);
377
378 // Count non-zeros
379 let n_nonzero = beta_orig.iter().filter(|&&b| b.abs() > 0.0).count();
380
381 // Fitted values & residuals
382 let fitted = predict(x, intercept, &beta_orig);
383 let residuals: Vec<f64> = y.iter().zip(&fitted).map(|(yi, yh)| yi - yh).collect();
384
385 // Statistics
386 let y_mean = y.iter().sum::<f64>() / n as f64;
387 let ss_tot: f64 = y.iter().map(|yi| (yi - y_mean).powi(2)).sum();
388 let ss_res: f64 = residuals.iter().map(|r| r.powi(2)).sum();
389 let mae: f64 = residuals.iter().map(|r| r.abs()).sum::<f64>() / n as f64;
390
391 let r_squared = if ss_tot > 1e-10 { 1.0 - ss_res / ss_tot } else { 1.0 };
392 let eff_df = 1.0 + n_nonzero as f64;
393 let adj_r_squared = if ss_tot > 1e-10 && n > eff_df as usize {
394 1.0 - (1.0 - r_squared) * ((n - 1) as f64 / (n as f64 - eff_df))
395 } else {
396 r_squared
397 };
398 let mse = ss_res / (n as f64 - eff_df).max(1.0);
399
400 // Model selection criteria
401 let ll = log_likelihood(n, mse, ss_res);
402 let n_coef = beta_orig.len() + 1; // coefficients + intercept
403 let aic_val = aic(ll, n_coef);
404 let bic_val = bic(ll, n_coef, n);
405
406 // Convert Internal lambda to Public (user-facing) lambda for reporting
407 // Public = Internal * y_scale_var * n (to match R's glmnet reporting)
408 let lambda_original_scale = lambda_standardized_value * lambda_conversion_factor;
409
410 fits.push(ElasticNetFit {
411 lambda: lambda_original_scale,
412 alpha: fit_options.alpha,
413 intercept,
414 coefficients: beta_orig,
415 fitted_values: fitted,
416 residuals,
417 n_nonzero,
418 iterations,
419 converged,
420 r_squared,
421 adj_r_squared,
422 mse,
423 rmse: mse.sqrt(),
424 mae,
425 log_likelihood: ll,
426 aic: aic_val,
427 bic: bic_val,
428 });
429 }
430
431 Ok(fits)
432}
433
434/// Fits elastic net regression for a single (lambda, alpha) pair.
435///
436/// Elastic net combines L1 (Lasso) and L2 (Ridge) penalties:
437/// - `alpha = 1.0` is pure Lasso (L1 only)
438/// - `alpha = 0.0` is pure Ridge (L2 only)
439/// - `alpha = 0.5` is an equal mix
440///
441/// # Arguments
442///
443/// * `x` - Design matrix (n rows × p columns including intercept)
444/// * `y` - Response variable (n observations)
445/// * `options` - Configuration options for elastic net regression
446///
447/// # Returns
448///
449/// An `ElasticNetFit` containing coefficients, convergence info, and metrics.
450///
451/// # Example
452///
453/// ```
454/// # use linreg_core::regularized::elastic_net::{elastic_net_fit, ElasticNetOptions};
455/// # use linreg_core::linalg::Matrix;
456/// let y = vec![2.0, 4.0, 6.0, 8.0];
457/// let x = Matrix::new(4, 2, vec![1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 4.0]);
458///
459/// // Elastic net with 50% L1, 50% L2
460/// let options = ElasticNetOptions {
461/// lambda: 0.1,
462/// alpha: 0.5,
463/// intercept: true,
464/// standardize: true,
465/// ..Default::default()
466/// };
467///
468/// let fit = elastic_net_fit(&x, &y, &options).unwrap();
469/// assert!(fit.converged);
470/// println!("R²: {}", fit.r_squared);
471/// # Ok::<(), linreg_core::Error>(())
472/// ```
473pub fn elastic_net_fit(x: &Matrix, y: &[f64], options: &ElasticNetOptions) -> Result<ElasticNetFit> {
474 if options.lambda < 0.0 {
475 return Err(Error::InvalidInput("Lambda must be non-negative".into()));
476 }
477 if options.alpha < 0.0 || options.alpha > 1.0 {
478 return Err(Error::InvalidInput("Alpha must be between 0 and 1".into()));
479 }
480
481 let n = x.rows;
482 let p = x.cols;
483
484 if y.len() != n {
485 return Err(Error::DimensionMismatch(format!(
486 "Length of y ({}) must match number of rows in X ({})",
487 y.len(),
488 n
489 )));
490 }
491
492 // Validate coefficient bounds
493 let n_predictors = if options.intercept { p - 1 } else { p };
494 if let Some(ref bounds) = options.coefficient_bounds {
495 if bounds.len() != n_predictors {
496 return Err(Error::InvalidInput(format!(
497 "Coefficient bounds length ({}) must match number of predictors ({})",
498 bounds.len(), n_predictors
499 )));
500 }
501 for (i, &(lower, upper)) in bounds.iter().enumerate() {
502 if lower > upper {
503 return Err(Error::InvalidInput(format!(
504 "Coefficient bounds for predictor {}: lower ({}) must be <= upper ({})",
505 i, lower, upper
506 )));
507 }
508 // Note: We allow (-inf, +inf) as it represents "no bounds" for that predictor
509 // This is useful for having mixed bounded/unbounded predictors
510 }
511 }
512
513 // Standardize X and y
514 // glmnet convention: y is always centered/scaled if intercept is present
515 let standardization_options = StandardizeOptions {
516 intercept: options.intercept,
517 standardize_x: options.standardize,
518 standardize_y: options.intercept,
519 weights: options.weights.clone(),
520 };
521
522 let (x_standardized, y_standardized, standardization_info) = standardize_xy(x, y, &standardization_options);
523
524 // Adjust lambda for scaling
525 // The path generation returns internal lambdas (for standardized data),
526 // which are used directly in coordinate descent.
527 //
528 // For single-lambda fits, the user provides "public" lambda values
529 // (like R reports), which need to be converted to "internal" scale:
530 // lambda_standardized_value = lambda_original_scale / y_scale
531 let y_scale_factor = standardization_info.y_scale.unwrap_or(1.0);
532 let lambda_standardized = if y_scale_factor > 1e-12 {
533 options.lambda / y_scale_factor
534 } else {
535 options.lambda
536 };
537
538 // DEBUG: Print scaling info
539 // #[cfg(debug_assertions)]
540 // {
541 // eprintln!("DEBUG elastic_net_fit: user_lambda = {}, y_scale = {}, lambda_standardized = {}",
542 // options.lambda, y_scale_factor, lambda_standardized);
543 // }
544
545 // Initial coefficients (all zeros)
546 let mut coefficients_standardized = vec![0.0; p];
547
548 // Determine unpenalized columns (e.g. intercept column 0 if manually added,
549 // but standardize_xy handles the intercept externally usually.
550 // If intercept=true, standardize_xy centers data and we don't penalize an implicit intercept.
551 // Here we assume x contains PREDICTORS only if intercept is handled by standardization centering.
552 // However, the `Matrix` struct might include a column of 1s if the user passed it.
553 // `standardize_xy` treats all columns in X as predictors to be standardized.
554 // If options.intercept is true, we compute the intercept from the means later.
555 // We assume X passed here does NOT contain a manual intercept column of 1s unless
556 // the user explicitly wants to penalize it (which is weird) or turned off intercept in options.
557 // For now, we penalize all columns in X according to penalty_factors.
558
559 // Check if we assume X has an intercept column at 0 that we should skip?
560 // The previous ridge/lasso implementations had a `first_penalized_column_index` logic:
561 // `let first_penalized_column_index = if options.intercept { 1 } else { 0 };`
562 // This implies `x` might have a column of 1s.
563 // GLMNET convention usually takes x matrix of predictors only.
564 // `standardize_xy` calculates means for ALL columns.
565 // If column 0 is all 1s, std dev is 0, standardization might fail or set to 0.
566 // Let's stick to the previous `lasso.rs` logic: if intercept is requested, we ignore column 0?
567 // `lasso.rs`: "Determine which columns are penalized. first_penalized_column_index = if options.intercept { 1 } else { 0 }"
568 // This strongly suggests the input Matrix `x` is expected to have a column of 1s at index 0 if intercept=true.
569 // We will preserve this behavior for compatibility with existing tests.
570 // i.e. this is going to be hell to refactor and I'm idly typing my thoughts away...
571 // This is a naive implementation anyways and only one head of the hydra that is glmnet.
572 let first_penalized_column_index = if options.intercept { 1 } else { 0 };
573
574 // Warm start initialization
575 if let Some(warm) = &options.warm_start {
576 // warm contains slope coefficients on ORIGINAL scale
577 // We need to transform them to STANDARDIZED scale
578 // coefficients_standardized = beta_orig * x_scale / y_scale
579 let y_scale = standardization_info.y_scale.unwrap_or(1.0);
580
581 if first_penalized_column_index == 1 {
582 // Case 1: Intercept at col 0
583 // warm start vector should correspond to cols 1..p (slopes)
584 // coefficients_standardized[0] stays 0.0 (intercept of centered data is 0)
585 if warm.len() == p - 1 {
586 for j in 1..p {
587 coefficients_standardized[j] = warm[j - 1] * standardization_info.x_scale[j] / y_scale;
588 }
589 } else {
590 // If dimensions don't match, ignore warm start or warn?
591 // For safety in this "todo" fix, we'll just ignore mismatched warm starts to avoid panics,
592 // but usually this indicates a caller error.
593 // Given I can't print warnings easily here, I'll ignore or maybe assume warm includes intercept?
594 // If warm has length p, maybe it includes intercept? But ElasticNetFit.coefficients excludes it.
595 // Let's stick to: warm start matches slopes.
596 }
597 } else {
598 // Case 2: No intercept column
599 if warm.len() == p {
600 for j in 0..p {
601 coefficients_standardized[j] = warm[j] * standardization_info.x_scale[j] / y_scale;
602 }
603 }
604 }
605 }
606
607 // Transform coefficient bounds to standardized scale
608 // Bounds on original scale need to be converted: coefficients_standardized = beta_orig * x_scale / y_scale
609 let bounds_standardized: Option<Vec<(f64, f64)>> = options.coefficient_bounds.as_ref().map(|bounds| {
610 let y_scale = standardization_info.y_scale.unwrap_or(1.0);
611 bounds.iter().enumerate().map(|(j, &(lower, upper))| {
612 // For each predictor j in original scale, the corresponding column
613 // in the standardized matrix is at index j+1 (col 0 is intercept)
614 let std_idx = j + 1;
615 let x_scale_predictor_j = if std_idx < standardization_info.x_scale.len() {
616 standardization_info.x_scale[std_idx]
617 } else {
618 1.0
619 };
620 let scale_factor = x_scale_predictor_j / y_scale;
621 (lower * scale_factor, upper * scale_factor)
622 }).collect()
623 });
624
625 let (iterations, converged) = coordinate_descent(
626 &x_standardized,
627 &y_standardized,
628 &mut coefficients_standardized,
629 lambda_standardized,
630 options.alpha,
631 first_penalized_column_index,
632 options.max_iter,
633 options.tol,
634 options.penalty_factor.as_deref(),
635 bounds_standardized.as_deref(),
636 &standardization_info.column_squared_norms,
637 )?;
638
639 // Unstandardize
640 let (intercept, beta_orig) = unstandardize_coefficients(&coefficients_standardized, &standardization_info);
641
642 // Count nonzero (excluding intercept)
643 // beta_orig contains slopes. If first_penalized_column_index=1, coefficients_standardized[0] was 0.
644 // The coefficients returned should correspond to the columns of X (excluding the manual intercept if present?).
645 // `unstandardize_coefficients` handles the mapping.
646 let n_nonzero = beta_orig.iter().filter(|&&b| b.abs() > 0.0).count();
647
648 // Fitted values
649 let fitted = predict(x, intercept, &beta_orig);
650 let residuals: Vec<f64> = y.iter().zip(&fitted).map(|(yi, yh)| yi - yh).collect();
651
652 // Statistics
653 let y_mean = y.iter().sum::<f64>() / n as f64;
654 let ss_tot: f64 = y.iter().map(|yi| (yi - y_mean).powi(2)).sum();
655 let ss_res: f64 = residuals.iter().map(|r| r.powi(2)).sum();
656 let mae: f64 = residuals.iter().map(|r| r.abs()).sum::<f64>() / n as f64;
657
658 let r_squared = if ss_tot > 1e-10 { 1.0 - ss_res / ss_tot } else { 1.0 };
659
660 // Effective DF approximation for Elastic Net
661 // df ≈ n_nonzero for Lasso
662 // df ≈ trace(S) for Ridge
663 // We use a naive approximation here: n_nonzero
664 let eff_df = 1.0 + n_nonzero as f64;
665 let adj_r_squared = if ss_tot > 1e-10 && n > eff_df as usize {
666 1.0 - (1.0 - r_squared) * ((n - 1) as f64 / (n as f64 - eff_df))
667 } else {
668 r_squared
669 };
670
671 let mse = ss_res / (n as f64 - eff_df).max(1.0);
672
673 // Model selection criteria
674 let ss_res: f64 = residuals.iter().map(|&r| r * r).sum();
675 let ll = log_likelihood(n, mse, ss_res);
676 let n_coef = beta_orig.len() + 1; // coefficients + intercept
677 let aic_val = aic(ll, n_coef);
678 let bic_val = bic(ll, n_coef, n);
679
680 Ok(ElasticNetFit {
681 lambda: options.lambda,
682 alpha: options.alpha,
683 intercept,
684 coefficients: beta_orig,
685 fitted_values: fitted,
686 residuals,
687 n_nonzero,
688 iterations,
689 converged,
690 r_squared,
691 adj_r_squared,
692 mse,
693 rmse: mse.sqrt(),
694 mae,
695 log_likelihood: ll,
696 aic: aic_val,
697 bic: bic_val,
698 })
699}
700
701#[allow(clippy::too_many_arguments)]
702#[allow(clippy::needless_range_loop)]
703fn coordinate_descent(
704 x: &Matrix,
705 y: &[f64],
706 beta: &mut [f64],
707 lambda: f64,
708 alpha: f64,
709 first_penalized_column_index: usize,
710 max_iter: usize,
711 tol: f64,
712 penalty_factor: Option<&[f64]>,
713 bounds: Option<&[(f64, f64)]>,
714 column_squared_norms: &[f64], // Column squared norms (for coordinate descent update)
715) -> Result<(usize, bool)> {
716 let n = x.rows;
717 let p = x.cols;
718
719 // Residuals r = y - Xβ
720 // Initialize with all betas zero -> residuals = y
721 // If y contains infinity/NaN, residuals will too
722 let mut residuals = y.to_vec();
723
724 // Check for non-finite residuals initially - if present, we can't optimize
725 if residuals.iter().any(|r| !r.is_finite()) {
726 return Ok((0, false));
727 }
728
729 // Handle non-zero initial betas (warm starts)
730 for j in 0..p {
731 if beta[j] != 0.0 {
732 for i in 0..n {
733 residuals[i] -= x.get(i, j) * beta[j];
734 }
735 }
736 }
737
738 // Active set: indices of non-zero coefficients
739 let mut active_set = vec![false; p];
740
741 let mut converged = false;
742 let mut iter = 0;
743
744 while iter < max_iter {
745 let mut maximum_coefficient_change = 0.0;
746
747 // --- Full Pass ---
748 for j in first_penalized_column_index..p {
749 if update_feature(j, x, &mut residuals, beta, lambda, alpha, penalty_factor, bounds, column_squared_norms, &mut maximum_coefficient_change) {
750 active_set[j] = true;
751 }
752 }
753 iter += 1;
754
755 if maximum_coefficient_change < tol {
756 converged = true;
757 break;
758 }
759
760 // --- Active Set Loop ---
761 loop {
762 if iter >= max_iter { break; }
763
764 let mut active_set_coefficient_change = 0.0;
765 let mut active_count = 0;
766
767 for j in first_penalized_column_index..p {
768 if active_set[j] {
769 update_feature(j, x, &mut residuals, beta, lambda, alpha, penalty_factor, bounds, column_squared_norms, &mut active_set_coefficient_change);
770 active_count += 1;
771
772 if beta[j] == 0.0 {
773 active_set[j] = false;
774 }
775 }
776 }
777
778 iter += 1;
779
780 if active_set_coefficient_change < tol {
781 break;
782 }
783
784 if active_count == 0 {
785 break;
786 }
787 }
788 }
789
790 Ok((iter, converged))
791}
792
793#[inline]
794#[allow(clippy::too_many_arguments)]
795#[allow(clippy::needless_range_loop)]
796fn update_feature(
797 j: usize,
798 x: &Matrix,
799 residuals: &mut [f64],
800 beta: &mut [f64],
801 lambda: f64,
802 alpha: f64,
803 penalty_factor: Option<&[f64]>,
804 bounds: Option<&[(f64, f64)]>,
805 column_squared_norms: &[f64], // Column squared norms (for coordinate descent update)
806 maximum_coefficient_change: &mut f64
807) -> bool {
808 // Penalty factor
809 let penalty_factor_value = penalty_factor.and_then(|v| v.get(j)).copied().unwrap_or(1.0);
810 if penalty_factor_value == f64::INFINITY {
811 beta[j] = 0.0;
812 return false;
813 }
814
815 let n = x.rows;
816 let coefficient_previous = beta[j];
817
818 // Calculate partial residual correlation (rho)
819 // residuals currently = y - Sum(Xk * beta_k)
820 // We want r_partial = y - Sum_{k!=j}(Xk * beta_k) = residuals + Xj * beta_j
821 // rho = Xj^T * r_partial = Xj^T * residuals + (Xj^T * Xj) * beta_j
822 // where Xj^T * Xj = column_squared_norms[j] (the squared norm of column j after standardization)
823
824 let mut partial_correlation_unscaled = 0.0;
825 for i in 0..n {
826 partial_correlation_unscaled += x.get(i, j) * residuals[i];
827 }
828 // Use column_squared_norms[j] instead of assuming 1.0
829 let rho = partial_correlation_unscaled + column_squared_norms[j] * coefficient_previous;
830
831 // Soft thresholding
832 // Numerator: S(rho, lambda * alpha * penalty_factor_value)
833 let threshold = lambda * alpha * penalty_factor_value;
834 let soft_threshold_result = soft_threshold(rho, threshold);
835
836 // Denominator
837 // Elastic net denominator: column_squared_norms[j] + lambda * (1 - alpha) * penalty_factor_value
838 // This matches glmnet's formula
839 let denominator_with_ridge_penalty = column_squared_norms[j] + lambda * (1.0 - alpha) * penalty_factor_value;
840
841 let mut coefficient_updated = soft_threshold_result / denominator_with_ridge_penalty;
842
843 // Apply coefficient bounds (clamping) if provided
844 // Bounds clamp the calculated value to [lower, upper]
845 if let Some(bounds) = bounds {
846 // bounds[j-1] because bounds is indexed by predictor (excluding intercept)
847 // and j starts at first_penalized_column_index (usually 1 for intercept models)
848 let bounds_idx = j.saturating_sub(1);
849 if let Some((lower, upper)) = bounds.get(bounds_idx) {
850 coefficient_updated = coefficient_updated.max(*lower).min(*upper);
851 }
852 }
853
854 // Update residuals if beta changed
855 if coefficient_updated != coefficient_previous {
856 let coefficient_change = coefficient_updated - coefficient_previous;
857 for i in 0..n {
858 // residuals_new = residuals_old - x_j * coefficient_change
859 residuals[i] -= x.get(i, j) * coefficient_change;
860 }
861 beta[j] = coefficient_updated;
862 *maximum_coefficient_change = maximum_coefficient_change.max(coefficient_change.abs());
863 true // changed
864 } else {
865 false // no change
866 }
867}
868
869// ============================================================================
870// Model Serialization Traits
871// ============================================================================
872
// Generate the `ModelSave` and `ModelLoad` trait implementations for `ElasticNetFit`
// through the crate-level `impl_serialization!` macro, passing
// `ModelType::ElasticNet` as the model type tag.
// NOTE(review): the "ElasticNet" string is presumably the type name recorded in (or
// checked against) serialized files — confirm against the macro definition.
impl_serialization!(ElasticNetFit, ModelType::ElasticNet, "ElasticNet");
875
#[cfg(test)]
mod tests {
    use super::*;

    // ------------------------------------------------------------------
    // Shared fixtures
    // ------------------------------------------------------------------

    /// Response vector for the exact linear relationship `y = 2 * x + 1`
    /// evaluated at `x = 1, ..., 5`.
    fn linear_response() -> Vec<f64> {
        vec![3.0, 5.0, 7.0, 9.0, 11.0]
    }

    /// Predictor column `1.0, 2.0, ..., 5.0`.
    fn ramp_predictor() -> Vec<f64> {
        (1_i32..=5).map(f64::from).collect()
    }

    /// Builds a row-major design matrix whose first column is all ones (the
    /// intercept column) followed by one column per entry of `predictors`.
    ///
    /// The number of rows is taken from the first predictor; every predictor
    /// must have that same length.
    fn design_with_intercept(predictors: &[Vec<f64>]) -> Matrix {
        let n_rows = predictors.first().map_or(0, Vec::len);
        let n_cols = predictors.len() + 1;

        // Start from all ones so the intercept column is already in place.
        let mut data = vec![1.0; n_rows * n_cols];
        for (j, column) in predictors.iter().enumerate() {
            assert_eq!(
                column.len(),
                n_rows,
                "all predictor columns must have the same length"
            );
            for (i, &value) in column.iter().enumerate() {
                // Column 0 is the intercept, so predictor `j` lives in column `j + 1`.
                data[i * n_cols + j + 1] = value;
            }
        }
        Matrix::new(n_rows, n_cols, data)
    }

    // ------------------------------------------------------------------
    // soft_threshold
    // ------------------------------------------------------------------

    #[test]
    fn test_soft_threshold_basic_cases() {
        // Test soft_threshold function edge cases
        assert_eq!(soft_threshold(5.0, 2.0), 3.0); // z > gamma
        assert_eq!(soft_threshold(-5.0, 2.0), -3.0); // z < -gamma
        assert_eq!(soft_threshold(1.0, 2.0), 0.0); // |z| <= gamma
        assert_eq!(soft_threshold(2.0, 2.0), 0.0); // z == gamma
        assert_eq!(soft_threshold(-2.0, 2.0), 0.0); // z == -gamma
    }

    #[test]
    fn test_soft_threshold_zero() {
        // A zero threshold must leave the input untouched.
        assert_eq!(soft_threshold(0.0, 0.0), 0.0);
        assert_eq!(soft_threshold(5.0, 0.0), 5.0);
        assert_eq!(soft_threshold(-5.0, 0.0), -5.0);
    }

    #[test]
    #[should_panic(expected = "Soft threshold gamma must be non-negative")]
    fn test_soft_threshold_negative_gamma_panics() {
        soft_threshold(1.0, -1.0);
    }

    // ------------------------------------------------------------------
    // ElasticNetOptions
    // ------------------------------------------------------------------

    #[test]
    fn test_elastic_net_options_default() {
        let options = ElasticNetOptions::default();
        assert_eq!(options.lambda, 1.0);
        assert_eq!(options.alpha, 1.0); // Default is 1.0 (Lasso)
        assert!(options.intercept);
        assert!(options.standardize);
        assert_eq!(options.max_iter, 100000);
        assert_eq!(options.tol, 1e-7);
        assert!(options.penalty_factor.is_none());
        assert!(options.warm_start.is_none());
        assert!(options.coefficient_bounds.is_none());
    }

    // ------------------------------------------------------------------
    // elastic_net_fit
    // ------------------------------------------------------------------

    #[test]
    fn test_elastic_net_fit_simple() {
        // Simple linear relationship: y = 2*x + 1
        let y = linear_response();
        let x = design_with_intercept(&[ramp_predictor()]);

        let options = ElasticNetOptions {
            lambda: 0.01, // Small lambda for minimal regularization
            alpha: 0.5,
            intercept: true,
            standardize: true,
            ..Default::default()
        };

        let fit = elastic_net_fit(&x, &y, &options)
            .expect("elastic net fit should succeed on a clean linear signal");

        assert!(fit.converged);
        // Coefficients should be close to [1, 2] (intercept, slope)
        assert!((fit.intercept - 1.0).abs() < 0.5);
        assert!((fit.coefficients[0] - 2.0).abs() < 0.5);
    }

    #[test]
    fn test_elastic_net_fit_with_penalty_factor() {
        let y = linear_response();
        let x = design_with_intercept(&[ramp_predictor()]);

        let options = ElasticNetOptions {
            lambda: 0.1,
            alpha: 0.5,
            penalty_factor: Some(vec![1.0]),
            intercept: true,
            standardize: true,
            ..Default::default()
        };

        elastic_net_fit(&x, &y, &options)
            .expect("elastic net fit should accept an explicit penalty factor");
    }

    #[test]
    fn test_elastic_net_fit_with_coefficient_bounds() {
        let y = linear_response();
        let x = design_with_intercept(&[ramp_predictor()]);

        let options = ElasticNetOptions {
            lambda: 0.01,
            alpha: 0.5,
            coefficient_bounds: Some(vec![(0.0, 3.0)]), // Bound slope to [0, 3]
            intercept: true,
            standardize: true,
            ..Default::default()
        };

        let fit = elastic_net_fit(&x, &y, &options)
            .expect("elastic net fit should accept coefficient bounds");

        // Coefficient should be within bounds
        assert!(fit.coefficients[0] >= 0.0);
        assert!(fit.coefficients[0] <= 3.0);
    }

    #[test]
    fn test_elastic_net_pure_lasso() {
        // alpha = 1.0 means pure Lasso
        let y = linear_response();
        let x = design_with_intercept(&[ramp_predictor()]);

        let options = ElasticNetOptions {
            lambda: 1.0,
            alpha: 1.0, // Pure Lasso
            intercept: true,
            standardize: true,
            ..Default::default()
        };

        elastic_net_fit(&x, &y, &options).expect("pure Lasso fit (alpha = 1) should succeed");
    }

    #[test]
    fn test_elastic_net_pure_ridge() {
        // alpha = 0.0 means pure Ridge
        let y = linear_response();
        let x = design_with_intercept(&[ramp_predictor()]);

        let options = ElasticNetOptions {
            lambda: 0.1,
            alpha: 0.0, // Pure Ridge
            intercept: true,
            standardize: true,
            ..Default::default()
        };

        let fit = elastic_net_fit(&x, &y, &options)
            .expect("pure Ridge fit (alpha = 0) should succeed");

        // Ridge shouldn't zero out coefficients
        assert!(fit.n_nonzero >= 1);
    }

    #[test]
    fn test_elastic_fit_no_intercept() {
        let y = vec![1.0, 2.0, 3.0, 4.0, 5.0];

        // Single predictor column and no intercept column: a 5 x 1 matrix.
        let x = Matrix::new(5, 1, ramp_predictor());

        let options = ElasticNetOptions {
            lambda: 0.01,
            alpha: 0.5,
            intercept: false, // No intercept
            standardize: true,
            ..Default::default()
        };

        elastic_net_fit(&x, &y, &options)
            .expect("elastic net fit without an intercept should succeed");
    }

    #[test]
    fn test_elastic_net_with_warm_start() {
        let y = linear_response();
        let x = design_with_intercept(&[ramp_predictor()]);

        // One starting value for the single predictor.
        let warm = vec![1.5];

        let options = ElasticNetOptions {
            lambda: 0.1,
            alpha: 0.5,
            intercept: true,
            standardize: true,
            warm_start: Some(warm),
            ..Default::default()
        };

        elastic_net_fit(&x, &y, &options)
            .expect("elastic net fit should accept warm-start coefficients");
    }

    #[test]
    fn test_elastic_net_multivariate() {
        // Multiple predictors
        let y = linear_response();
        let x2 = vec![2.0, 4.0, 5.0, 4.0, 3.0];
        let x = design_with_intercept(&[ramp_predictor(), x2]);

        let options = ElasticNetOptions {
            lambda: 0.1,
            alpha: 0.5,
            intercept: true,
            standardize: true,
            ..Default::default()
        };

        let fit = elastic_net_fit(&x, &y, &options)
            .expect("elastic net fit with two predictors should succeed");

        assert_eq!(fit.coefficients.len(), 2); // Two predictors
    }
}