Skip to main content

fdars_core/scalar_on_function/
fregre_lm.rs

1use super::{
2    build_design_matrix, cholesky_factor, compute_beta_se, compute_fitted, compute_ols_std_errors,
3    compute_r_squared, compute_xtx, ols_solve, recover_beta_t, resolve_ncomp,
4    validate_fregre_inputs, FregreCvResult, FregreLmResult, ModelSelectionResult,
5    SelectionCriterion,
6};
7use crate::cv::create_folds;
8use crate::error::FdarError;
9use crate::matrix::FdMatrix;
10use crate::regression::fdata_to_pc_1d;
11
12// ---------------------------------------------------------------------------
13// fregre_lm: FPC-based functional linear model
14// ---------------------------------------------------------------------------
15
16/// Functional linear model with optional scalar covariates.
17///
18/// Fits the model: `y = α + Σ_k γ_k ξ_k + γ_z' z + ε`
19/// where ξ_k are FPC scores of the functional predictor X(t) and z are scalar covariates.
20/// The functional coefficient is recovered as `β(t) = Σ_k γ_k φ_k(t)`.
21///
22/// # Arguments
23/// * `data` - Functional predictor matrix (n × m)
24/// * `y` - Scalar response vector (length n)
25/// * `scalar_covariates` - Optional scalar covariates matrix (n × p), `None` for pure functional model
26/// * `ncomp` - Number of FPC components (if 0, selected by GCV)
27///
28/// # Errors
29///
30/// Returns [`FdarError::InvalidDimension`] if `data` has fewer than 3 rows,
31/// zero columns, `y.len() != n`, or `scalar_covariates` row count differs from `n`.
32/// Returns [`FdarError::InvalidParameter`] if auto-selected `ncomp` via
33/// [`fregre_cv`] fails due to invalid range, or if `ncomp` passed to FPCA is zero.
34/// Returns [`FdarError::ComputationFailed`] if the SVD inside FPCA fails to
35/// extract components, or if the OLS normal equations (X'X) are singular.
36///
37/// # Returns
38/// [`FregreLmResult`] with estimated coefficients, fitted values, and diagnostics
39///
40/// # Examples
41///
42/// ```
43/// use fdars_core::matrix::FdMatrix;
44/// use fdars_core::scalar_on_function::fregre_lm;
45///
46/// // 20 curves at 30 evaluation points (each curve has a different frequency)
47/// let (n, m) = (20, 30);
48/// let data = FdMatrix::from_column_major(
49///     (0..n * m).map(|k| {
50///         let i = (k % n) as f64;
51///         let j = (k / n) as f64;
52///         ((i + 1.0) * j * 0.2).sin()
53///     }).collect(),
54///     n, m,
55/// ).unwrap();
56/// let y: Vec<f64> = (0..n).map(|i| (i as f64 * 0.5).sin()).collect();
57/// let fit = fregre_lm(&data, &y, None, 3).unwrap();
58/// assert_eq!(fit.fitted_values.len(), 20);
59/// assert_eq!(fit.beta_t.len(), 30);
60/// assert!(fit.r_squared >= 0.0 && fit.r_squared <= 1.0);
61/// ```
62#[must_use = "expensive computation whose result should not be discarded"]
63pub fn fregre_lm(
64    data: &FdMatrix,
65    y: &[f64],
66    scalar_covariates: Option<&FdMatrix>,
67    ncomp: usize,
68) -> Result<FregreLmResult, FdarError> {
69    let (n, m) = data.shape();
70    validate_fregre_inputs(n, m, y, scalar_covariates)?;
71
72    let ncomp = resolve_ncomp(ncomp, data, y, scalar_covariates, n, m)?;
73
74    let argvals: Vec<f64> = (0..m).map(|j| j as f64 / (m - 1).max(1) as f64).collect();
75    let fpca = fdata_to_pc_1d(data, ncomp, &argvals)?;
76    let design = build_design_matrix(&fpca.scores, ncomp, scalar_covariates, n);
77    let p_total = design.ncols();
78    let (coeffs, hat_diag) = ols_solve(&design, y)?;
79
80    let fitted_values = compute_fitted(&design, &coeffs);
81    let residuals: Vec<f64> = y
82        .iter()
83        .zip(&fitted_values)
84        .map(|(&yi, &yh)| yi - yh)
85        .collect();
86    let (r_squared, r_squared_adj) = compute_r_squared(y, &residuals, p_total);
87
88    let ss_res: f64 = residuals.iter().map(|r| r * r).sum();
89    let df_resid = (n as f64 - p_total as f64).max(1.0);
90    let residual_se = (ss_res / df_resid).sqrt();
91    let sigma2 = ss_res / df_resid;
92
93    let xtx = compute_xtx(&design);
94    let l = cholesky_factor(&xtx, p_total).unwrap_or_else(|_| vec![1.0; p_total * p_total]);
95    let std_errors = compute_ols_std_errors(&l, p_total, sigma2);
96
97    let gcv = hat_diag
98        .iter()
99        .zip(&residuals)
100        .map(|(&h, &r)| (r / (1.0 - h).max(1e-10)).powi(2))
101        .sum::<f64>()
102        / n as f64;
103
104    let beta_t = recover_beta_t(&coeffs[1..=ncomp], &fpca.rotation, m);
105    let beta_se = compute_beta_se(&std_errors[1..=ncomp], &fpca.rotation, m);
106    let gamma: Vec<f64> = coeffs[1 + ncomp..].to_vec();
107
108    let nf = n as f64;
109    let rss = ss_res;
110    let aic = nf * (rss / nf).ln() + 2.0 * p_total as f64;
111    let bic = nf * (rss / nf).ln() + (nf).ln() * p_total as f64;
112
113    Ok(FregreLmResult {
114        intercept: coeffs[0],
115        beta_t,
116        beta_se,
117        gamma,
118        fitted_values,
119        residuals,
120        r_squared,
121        r_squared_adj,
122        std_errors,
123        ncomp,
124        fpca,
125        coefficients: coeffs,
126        residual_se,
127        gcv,
128        aic,
129        bic,
130    })
131}
132
133// ---------------------------------------------------------------------------
134// fregre_cv: Cross-validation for K selection
135// ---------------------------------------------------------------------------
136
137/// Copy a subset of rows from src into dst.
138fn copy_rows(dst: &mut FdMatrix, src: &FdMatrix, src_rows: &[usize]) {
139    let ncols = src.ncols();
140    for (dst_i, &src_i) in src_rows.iter().enumerate() {
141        for j in 0..ncols {
142            dst[(dst_i, j)] = src[(src_i, j)];
143        }
144    }
145}
146
147/// Compute CV error for a single K across all folds (randomized assignment).
148fn cv_error_for_k(
149    data: &FdMatrix,
150    y: &[f64],
151    scalar_covariates: Option<&FdMatrix>,
152    k: usize,
153    n_folds: usize,
154    folds: &[usize],
155) -> Result<f64, FdarError> {
156    let n = data.nrows();
157    let ncols = data.ncols();
158    let mut total_error = 0.0;
159    let mut count = 0;
160
161    for fold in 0..n_folds {
162        let train_idx: Vec<usize> = (0..n).filter(|&i| folds[i] != fold).collect();
163        let test_idx: Vec<usize> = (0..n).filter(|&i| folds[i] == fold).collect();
164        let n_test = test_idx.len();
165        if n_test == 0 || train_idx.len() < k + 2 {
166            continue;
167        }
168
169        let mut train_data = FdMatrix::zeros(train_idx.len(), ncols);
170        let mut test_data = FdMatrix::zeros(n_test, ncols);
171        copy_rows(&mut train_data, data, &train_idx);
172        copy_rows(&mut test_data, data, &test_idx);
173
174        let train_y: Vec<f64> = train_idx.iter().map(|&i| y[i]).collect();
175        let test_y: Vec<f64> = test_idx.iter().map(|&i| y[i]).collect();
176
177        let train_sc = scalar_covariates.map(|sc| {
178            let mut m = FdMatrix::zeros(train_idx.len(), sc.ncols());
179            copy_rows(&mut m, sc, &train_idx);
180            m
181        });
182        let test_sc = scalar_covariates.map(|sc| {
183            let mut m = FdMatrix::zeros(n_test, sc.ncols());
184            copy_rows(&mut m, sc, &test_idx);
185            m
186        });
187
188        let Ok(fit) = fregre_lm(&train_data, &train_y, train_sc.as_ref(), k) else {
189            continue;
190        };
191
192        let predictions = predict_fregre_lm(&fit, &test_data, test_sc.as_ref());
193        let fold_mse: f64 = predictions
194            .iter()
195            .zip(&test_y)
196            .map(|(&yhat, &yi)| (yhat - yi).powi(2))
197            .sum::<f64>()
198            / n_test as f64;
199
200        total_error += fold_mse * n_test as f64;
201        count += n_test;
202    }
203
204    if count > 0 {
205        Ok(total_error / count as f64)
206    } else {
207        Err(FdarError::ComputationFailed {
208            operation: "CV error computation",
209            detail: "no valid folds produced predictions; try reducing ncomp or increasing the number of observations".to_string(),
210        })
211    }
212}
213
214/// K-fold cross-validation for selecting the number of FPC components.
215///
216/// # Arguments
217/// * `data` - Functional predictor matrix (n × m)
218/// * `y` - Scalar response vector (length n)
219/// * `scalar_covariates` - Optional scalar covariates matrix
220/// * `k_min` - Minimum number of components to test
221/// * `k_max` - Maximum number of components to test
222/// * `n_folds` - Number of CV folds
223///
224/// # Errors
225///
226/// Returns [`FdarError::InvalidDimension`] if `data` has fewer rows than `n_folds`.
227/// Returns [`FdarError::InvalidParameter`] if `k_min < 1` or `k_min > k_max`.
228/// Returns [`FdarError::ComputationFailed`] if no candidate K value produces a
229/// valid CV error across all folds.
230#[must_use = "expensive computation whose result should not be discarded"]
231pub fn fregre_cv(
232    data: &FdMatrix,
233    y: &[f64],
234    scalar_covariates: Option<&FdMatrix>,
235    k_min: usize,
236    k_max: usize,
237    n_folds: usize,
238) -> Result<FregreCvResult, FdarError> {
239    let n = data.nrows();
240    if n < n_folds {
241        return Err(FdarError::InvalidDimension {
242            parameter: "data",
243            expected: format!("at least {n_folds} rows (n_folds)"),
244            actual: format!("{n}"),
245        });
246    }
247    if k_min < 1 || k_min > k_max {
248        return Err(FdarError::InvalidParameter {
249            parameter: "k_min/k_max",
250            message: format!("k_min={k_min} must be >= 1 and <= k_max={k_max}"),
251        });
252    }
253
254    let k_max = k_max.min(n - 2).min(data.ncols());
255
256    // Use randomized fold assignment (consistent seed for reproducibility)
257    let folds = create_folds(n, n_folds, 42);
258
259    let mut k_values = Vec::new();
260    let mut cv_errors = Vec::new();
261
262    for k in k_min..=k_max {
263        if let Ok(err) = cv_error_for_k(data, y, scalar_covariates, k, n_folds, &folds) {
264            k_values.push(k);
265            cv_errors.push(err);
266        }
267    }
268
269    if k_values.is_empty() {
270        return Err(FdarError::ComputationFailed {
271            operation: "fregre_cv",
272            detail: "no valid K values produced CV errors; all candidate ncomp values failed — check data for zero-variance columns or increase n".to_string(),
273        });
274    }
275
276    let (optimal_idx, &min_cv_error) = cv_errors
277        .iter()
278        .enumerate()
279        .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
280        .expect("checked: cv_errors is non-empty");
281
282    let optimal_k = k_values[optimal_idx];
283
284    // Collect OOF predictions at optimal K
285    let ncols = data.ncols();
286    let mut oof_predictions = vec![f64::NAN; n];
287    let mut fold_errors = Vec::with_capacity(n_folds);
288
289    for fold in 0..n_folds {
290        let train_idx: Vec<usize> = (0..n).filter(|&i| folds[i] != fold).collect();
291        let test_idx: Vec<usize> = (0..n).filter(|&i| folds[i] == fold).collect();
292        let n_test = test_idx.len();
293        if n_test == 0 || train_idx.len() < optimal_k + 2 {
294            fold_errors.push(f64::NAN);
295            continue;
296        }
297
298        let mut train_data = FdMatrix::zeros(train_idx.len(), ncols);
299        let mut test_data = FdMatrix::zeros(n_test, ncols);
300        copy_rows(&mut train_data, data, &train_idx);
301        copy_rows(&mut test_data, data, &test_idx);
302
303        let train_y: Vec<f64> = train_idx.iter().map(|&i| y[i]).collect();
304
305        let train_sc = scalar_covariates.map(|sc| {
306            let mut m = FdMatrix::zeros(train_idx.len(), sc.ncols());
307            copy_rows(&mut m, sc, &train_idx);
308            m
309        });
310        let test_sc = scalar_covariates.map(|sc| {
311            let mut m = FdMatrix::zeros(n_test, sc.ncols());
312            copy_rows(&mut m, sc, &test_idx);
313            m
314        });
315
316        if let Ok(fit) = fregre_lm(&train_data, &train_y, train_sc.as_ref(), optimal_k) {
317            let preds = predict_fregre_lm(&fit, &test_data, test_sc.as_ref());
318            let mut fold_mse = 0.0;
319            for (ti, &i) in test_idx.iter().enumerate() {
320                oof_predictions[i] = preds[ti];
321                fold_mse += (preds[ti] - y[i]).powi(2);
322            }
323            fold_errors.push(fold_mse / n_test as f64);
324        } else {
325            fold_errors.push(f64::NAN);
326        }
327    }
328
329    Ok(FregreCvResult {
330        k_values: k_values.clone(),
331        cv_errors,
332        optimal_k,
333        min_cv_error,
334        oof_predictions,
335        fold_assignments: folds,
336        fold_errors,
337    })
338}
339
340// ---------------------------------------------------------------------------
341// Model selection
342// ---------------------------------------------------------------------------
343
344/// Select optimal ncomp for `fregre_lm` using AIC, BIC, or GCV.
345///
346/// Fits models for `ncomp = 1..=max_ncomp`, collects information criteria,
347/// and returns the best ncomp under the chosen criterion.
348///
349/// # Arguments
350/// * `data` — Functional predictor matrix (n × m)
351/// * `y` — Scalar responses (length n)
352/// * `scalar_covariates` — Optional scalar covariates (n × p)
353/// * `max_ncomp` — Maximum number of FPC components to try
354/// * `criterion` — Which criterion to minimise
355///
356/// # Errors
357///
358/// Returns [`FdarError::InvalidParameter`] if `max_ncomp` is zero.
359/// Returns [`FdarError::ComputationFailed`] if no candidate ncomp produces a
360/// valid fitted model (all calls to [`fregre_lm`] fail).
361#[must_use = "expensive computation whose result should not be discarded"]
362pub fn model_selection_ncomp(
363    data: &FdMatrix,
364    y: &[f64],
365    scalar_covariates: Option<&FdMatrix>,
366    max_ncomp: usize,
367    criterion: SelectionCriterion,
368) -> Result<ModelSelectionResult, FdarError> {
369    if max_ncomp == 0 {
370        return Err(FdarError::InvalidParameter {
371            parameter: "max_ncomp",
372            message: "must be >= 1".to_string(),
373        });
374    }
375
376    let mut criteria = Vec::with_capacity(max_ncomp);
377
378    for k in 1..=max_ncomp {
379        if let Ok(fit) = fregre_lm(data, y, scalar_covariates, k) {
380            criteria.push((k, fit.aic, fit.bic, fit.gcv));
381        }
382    }
383
384    if criteria.is_empty() {
385        return Err(FdarError::ComputationFailed {
386            operation: "model_selection_ncomp",
387            detail: "no valid models could be fitted; all candidate ncomp values failed — check data for degeneracy or reduce the range".to_string(),
388        });
389    }
390
391    let best_idx = match criterion {
392        SelectionCriterion::Aic => criteria
393            .iter()
394            .enumerate()
395            .min_by(|(_, a), (_, b)| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
396            .map_or(0, |(i, _)| i),
397        SelectionCriterion::Bic => criteria
398            .iter()
399            .enumerate()
400            .min_by(|(_, a), (_, b)| a.2.partial_cmp(&b.2).unwrap_or(std::cmp::Ordering::Equal))
401            .map_or(0, |(i, _)| i),
402        SelectionCriterion::Gcv => criteria
403            .iter()
404            .enumerate()
405            .min_by(|(_, a), (_, b)| a.3.partial_cmp(&b.3).unwrap_or(std::cmp::Ordering::Equal))
406            .map_or(0, |(i, _)| i),
407    };
408
409    Ok(ModelSelectionResult {
410        best_ncomp: criteria[best_idx].0,
411        criteria,
412    })
413}
414
415/// Predict new responses using a fitted functional linear model.
416///
417/// # Arguments
418/// * `fit` - A fitted [`FregreLmResult`]
419/// * `new_data` - New functional predictor matrix (n_new × m)
420/// * `new_scalar` - Optional new scalar covariates (n_new × p)
421///
422/// # Examples
423///
424/// ```
425/// use fdars_core::matrix::FdMatrix;
426/// use fdars_core::scalar_on_function::{fregre_lm, predict_fregre_lm};
427///
428/// let (n, m) = (20, 30);
429/// let data = FdMatrix::from_column_major(
430///     (0..n * m).map(|k| {
431///         let i = (k % n) as f64;
432///         let j = (k / n) as f64;
433///         ((i + 1.0) * j * 0.2).sin()
434///     }).collect(),
435///     n, m,
436/// ).unwrap();
437/// let y: Vec<f64> = (0..n).map(|i| (i as f64 * 0.5).sin()).collect();
438/// let fit = fregre_lm(&data, &y, None, 3).unwrap();
439///
440/// // Predict on same data
441/// let predictions = predict_fregre_lm(&fit, &data, None);
442/// assert_eq!(predictions.len(), 20);
443/// ```
444pub fn predict_fregre_lm(
445    fit: &FregreLmResult,
446    new_data: &FdMatrix,
447    new_scalar: Option<&FdMatrix>,
448) -> Vec<f64> {
449    let (n_new, m) = new_data.shape();
450    let ncomp = fit.ncomp;
451    let p_scalar = fit.gamma.len();
452
453    let mut predictions = vec![0.0; n_new];
454    for i in 0..n_new {
455        let mut yhat = fit.intercept;
456        for k in 0..ncomp {
457            let mut s = 0.0;
458            for j in 0..m {
459                s += (new_data[(i, j)] - fit.fpca.mean[j])
460                    * fit.fpca.rotation[(j, k)]
461                    * fit.fpca.weights[j];
462            }
463            yhat += fit.coefficients[1 + k] * s;
464        }
465        if let Some(sc) = new_scalar {
466            for j in 0..p_scalar {
467                yhat += fit.gamma[j] * sc[(i, j)];
468            }
469        }
470        predictions[i] = yhat;
471    }
472    predictions
473}