Skip to main content

fdars_core/scalar_on_function/
fregre_lm.rs

1use super::{
2    build_design_matrix, cholesky_factor, compute_beta_se, compute_fitted, compute_ols_std_errors,
3    compute_r_squared, compute_xtx, ols_solve, recover_beta_t, resolve_ncomp,
4    validate_fregre_inputs, FregreCvResult, FregreLmResult, ModelSelectionResult,
5    SelectionCriterion,
6};
7use crate::cv::create_folds;
8use crate::error::FdarError;
9use crate::matrix::FdMatrix;
10use crate::regression::fdata_to_pc_1d;
11
12// ---------------------------------------------------------------------------
13// fregre_lm: FPC-based functional linear model
14// ---------------------------------------------------------------------------
15
16/// Functional linear model with optional scalar covariates.
17///
18/// Fits the model: `y = α + Σ_k γ_k ξ_k + γ_z' z + ε`
19/// where ξ_k are FPC scores of the functional predictor X(t) and z are scalar covariates.
20/// The functional coefficient is recovered as `β(t) = Σ_k γ_k φ_k(t)`.
21///
22/// # Arguments
23/// * `data` - Functional predictor matrix (n × m)
24/// * `y` - Scalar response vector (length n)
25/// * `scalar_covariates` - Optional scalar covariates matrix (n × p), `None` for pure functional model
26/// * `ncomp` - Number of FPC components (if 0, selected by GCV)
27///
28/// # Errors
29///
30/// Returns [`FdarError::InvalidDimension`] if `data` has fewer than 3 rows,
31/// zero columns, `y.len() != n`, or `scalar_covariates` row count differs from `n`.
32/// Returns [`FdarError::InvalidParameter`] if auto-selected `ncomp` via
33/// [`fregre_cv`] fails due to invalid range, or if `ncomp` passed to FPCA is zero.
34/// Returns [`FdarError::ComputationFailed`] if the SVD inside FPCA fails to
35/// extract components, or if the OLS normal equations (X'X) are singular.
36///
37/// # Returns
38/// [`FregreLmResult`] with estimated coefficients, fitted values, and diagnostics
39///
40/// # Examples
41///
42/// ```
43/// use fdars_core::matrix::FdMatrix;
44/// use fdars_core::scalar_on_function::fregre_lm;
45///
46/// // 20 curves at 30 evaluation points (each curve has a different frequency)
47/// let (n, m) = (20, 30);
48/// let data = FdMatrix::from_column_major(
49///     (0..n * m).map(|k| {
50///         let i = (k % n) as f64;
51///         let j = (k / n) as f64;
52///         ((i + 1.0) * j * 0.2).sin()
53///     }).collect(),
54///     n, m,
55/// ).unwrap();
56/// let y: Vec<f64> = (0..n).map(|i| (i as f64 * 0.5).sin()).collect();
57/// let fit = fregre_lm(&data, &y, None, 3).unwrap();
58/// assert_eq!(fit.fitted_values.len(), 20);
59/// assert_eq!(fit.beta_t.len(), 30);
60/// assert!(fit.r_squared >= 0.0 && fit.r_squared <= 1.0);
61/// ```
62#[must_use = "expensive computation whose result should not be discarded"]
63pub fn fregre_lm(
64    data: &FdMatrix,
65    y: &[f64],
66    scalar_covariates: Option<&FdMatrix>,
67    ncomp: usize,
68) -> Result<FregreLmResult, FdarError> {
69    let (n, m) = data.shape();
70    validate_fregre_inputs(n, m, y, scalar_covariates)?;
71
72    let ncomp = resolve_ncomp(ncomp, data, y, scalar_covariates, n, m)?;
73
74    let fpca = fdata_to_pc_1d(data, ncomp)?;
75    let design = build_design_matrix(&fpca.scores, ncomp, scalar_covariates, n);
76    let p_total = design.ncols();
77    let (coeffs, hat_diag) = ols_solve(&design, y)?;
78
79    let fitted_values = compute_fitted(&design, &coeffs);
80    let residuals: Vec<f64> = y
81        .iter()
82        .zip(&fitted_values)
83        .map(|(&yi, &yh)| yi - yh)
84        .collect();
85    let (r_squared, r_squared_adj) = compute_r_squared(y, &residuals, p_total);
86
87    let ss_res: f64 = residuals.iter().map(|r| r * r).sum();
88    let df_resid = (n as f64 - p_total as f64).max(1.0);
89    let residual_se = (ss_res / df_resid).sqrt();
90    let sigma2 = ss_res / df_resid;
91
92    let xtx = compute_xtx(&design);
93    let l = cholesky_factor(&xtx, p_total).unwrap_or_else(|_| vec![1.0; p_total * p_total]);
94    let std_errors = compute_ols_std_errors(&l, p_total, sigma2);
95
96    let gcv = hat_diag
97        .iter()
98        .zip(&residuals)
99        .map(|(&h, &r)| (r / (1.0 - h).max(1e-10)).powi(2))
100        .sum::<f64>()
101        / n as f64;
102
103    let beta_t = recover_beta_t(&coeffs[1..=ncomp], &fpca.rotation, m);
104    let beta_se = compute_beta_se(&std_errors[1..=ncomp], &fpca.rotation, m);
105    let gamma: Vec<f64> = coeffs[1 + ncomp..].to_vec();
106
107    let nf = n as f64;
108    let rss = ss_res;
109    let aic = nf * (rss / nf).ln() + 2.0 * p_total as f64;
110    let bic = nf * (rss / nf).ln() + (nf).ln() * p_total as f64;
111
112    Ok(FregreLmResult {
113        intercept: coeffs[0],
114        beta_t,
115        beta_se,
116        gamma,
117        fitted_values,
118        residuals,
119        r_squared,
120        r_squared_adj,
121        std_errors,
122        ncomp,
123        fpca,
124        coefficients: coeffs,
125        residual_se,
126        gcv,
127        aic,
128        bic,
129    })
130}
131
132// ---------------------------------------------------------------------------
133// fregre_cv: Cross-validation for K selection
134// ---------------------------------------------------------------------------
135
136/// Copy a subset of rows from src into dst.
137fn copy_rows(dst: &mut FdMatrix, src: &FdMatrix, src_rows: &[usize]) {
138    let ncols = src.ncols();
139    for (dst_i, &src_i) in src_rows.iter().enumerate() {
140        for j in 0..ncols {
141            dst[(dst_i, j)] = src[(src_i, j)];
142        }
143    }
144}
145
146/// Compute CV error for a single K across all folds (randomized assignment).
147fn cv_error_for_k(
148    data: &FdMatrix,
149    y: &[f64],
150    scalar_covariates: Option<&FdMatrix>,
151    k: usize,
152    n_folds: usize,
153    folds: &[usize],
154) -> Result<f64, FdarError> {
155    let n = data.nrows();
156    let ncols = data.ncols();
157    let mut total_error = 0.0;
158    let mut count = 0;
159
160    for fold in 0..n_folds {
161        let train_idx: Vec<usize> = (0..n).filter(|&i| folds[i] != fold).collect();
162        let test_idx: Vec<usize> = (0..n).filter(|&i| folds[i] == fold).collect();
163        let n_test = test_idx.len();
164        if n_test == 0 || train_idx.len() < k + 2 {
165            continue;
166        }
167
168        let mut train_data = FdMatrix::zeros(train_idx.len(), ncols);
169        let mut test_data = FdMatrix::zeros(n_test, ncols);
170        copy_rows(&mut train_data, data, &train_idx);
171        copy_rows(&mut test_data, data, &test_idx);
172
173        let train_y: Vec<f64> = train_idx.iter().map(|&i| y[i]).collect();
174        let test_y: Vec<f64> = test_idx.iter().map(|&i| y[i]).collect();
175
176        let train_sc = scalar_covariates.map(|sc| {
177            let mut m = FdMatrix::zeros(train_idx.len(), sc.ncols());
178            copy_rows(&mut m, sc, &train_idx);
179            m
180        });
181        let test_sc = scalar_covariates.map(|sc| {
182            let mut m = FdMatrix::zeros(n_test, sc.ncols());
183            copy_rows(&mut m, sc, &test_idx);
184            m
185        });
186
187        let Ok(fit) = fregre_lm(&train_data, &train_y, train_sc.as_ref(), k) else {
188            continue;
189        };
190
191        let predictions = predict_fregre_lm(&fit, &test_data, test_sc.as_ref());
192        let fold_mse: f64 = predictions
193            .iter()
194            .zip(&test_y)
195            .map(|(&yhat, &yi)| (yhat - yi).powi(2))
196            .sum::<f64>()
197            / n_test as f64;
198
199        total_error += fold_mse * n_test as f64;
200        count += n_test;
201    }
202
203    if count > 0 {
204        Ok(total_error / count as f64)
205    } else {
206        Err(FdarError::ComputationFailed {
207            operation: "CV error computation",
208            detail: "no valid folds produced predictions; try reducing ncomp or increasing the number of observations".to_string(),
209        })
210    }
211}
212
213/// K-fold cross-validation for selecting the number of FPC components.
214///
215/// # Arguments
216/// * `data` - Functional predictor matrix (n × m)
217/// * `y` - Scalar response vector (length n)
218/// * `scalar_covariates` - Optional scalar covariates matrix
219/// * `k_min` - Minimum number of components to test
220/// * `k_max` - Maximum number of components to test
221/// * `n_folds` - Number of CV folds
222///
223/// # Errors
224///
225/// Returns [`FdarError::InvalidDimension`] if `data` has fewer rows than `n_folds`.
226/// Returns [`FdarError::InvalidParameter`] if `k_min < 1` or `k_min > k_max`.
227/// Returns [`FdarError::ComputationFailed`] if no candidate K value produces a
228/// valid CV error across all folds.
229#[must_use = "expensive computation whose result should not be discarded"]
230pub fn fregre_cv(
231    data: &FdMatrix,
232    y: &[f64],
233    scalar_covariates: Option<&FdMatrix>,
234    k_min: usize,
235    k_max: usize,
236    n_folds: usize,
237) -> Result<FregreCvResult, FdarError> {
238    let n = data.nrows();
239    if n < n_folds {
240        return Err(FdarError::InvalidDimension {
241            parameter: "data",
242            expected: format!("at least {n_folds} rows (n_folds)"),
243            actual: format!("{n}"),
244        });
245    }
246    if k_min < 1 || k_min > k_max {
247        return Err(FdarError::InvalidParameter {
248            parameter: "k_min/k_max",
249            message: format!("k_min={k_min} must be >= 1 and <= k_max={k_max}"),
250        });
251    }
252
253    let k_max = k_max.min(n - 2).min(data.ncols());
254
255    // Use randomized fold assignment (consistent seed for reproducibility)
256    let folds = create_folds(n, n_folds, 42);
257
258    let mut k_values = Vec::new();
259    let mut cv_errors = Vec::new();
260
261    for k in k_min..=k_max {
262        if let Ok(err) = cv_error_for_k(data, y, scalar_covariates, k, n_folds, &folds) {
263            k_values.push(k);
264            cv_errors.push(err);
265        }
266    }
267
268    if k_values.is_empty() {
269        return Err(FdarError::ComputationFailed {
270            operation: "fregre_cv",
271            detail: "no valid K values produced CV errors; all candidate ncomp values failed — check data for zero-variance columns or increase n".to_string(),
272        });
273    }
274
275    let (optimal_idx, &min_cv_error) = cv_errors
276        .iter()
277        .enumerate()
278        .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
279        .expect("checked: cv_errors is non-empty");
280
281    Ok(FregreCvResult {
282        k_values: k_values.clone(),
283        cv_errors,
284        optimal_k: k_values[optimal_idx],
285        min_cv_error,
286    })
287}
288
289// ---------------------------------------------------------------------------
290// Model selection
291// ---------------------------------------------------------------------------
292
293/// Select optimal ncomp for `fregre_lm` using AIC, BIC, or GCV.
294///
295/// Fits models for `ncomp = 1..=max_ncomp`, collects information criteria,
296/// and returns the best ncomp under the chosen criterion.
297///
298/// # Arguments
299/// * `data` — Functional predictor matrix (n × m)
300/// * `y` — Scalar responses (length n)
301/// * `scalar_covariates` — Optional scalar covariates (n × p)
302/// * `max_ncomp` — Maximum number of FPC components to try
303/// * `criterion` — Which criterion to minimise
304///
305/// # Errors
306///
307/// Returns [`FdarError::InvalidParameter`] if `max_ncomp` is zero.
308/// Returns [`FdarError::ComputationFailed`] if no candidate ncomp produces a
309/// valid fitted model (all calls to [`fregre_lm`] fail).
310#[must_use = "expensive computation whose result should not be discarded"]
311pub fn model_selection_ncomp(
312    data: &FdMatrix,
313    y: &[f64],
314    scalar_covariates: Option<&FdMatrix>,
315    max_ncomp: usize,
316    criterion: SelectionCriterion,
317) -> Result<ModelSelectionResult, FdarError> {
318    if max_ncomp == 0 {
319        return Err(FdarError::InvalidParameter {
320            parameter: "max_ncomp",
321            message: "must be >= 1".to_string(),
322        });
323    }
324
325    let mut criteria = Vec::with_capacity(max_ncomp);
326
327    for k in 1..=max_ncomp {
328        if let Ok(fit) = fregre_lm(data, y, scalar_covariates, k) {
329            criteria.push((k, fit.aic, fit.bic, fit.gcv));
330        }
331    }
332
333    if criteria.is_empty() {
334        return Err(FdarError::ComputationFailed {
335            operation: "model_selection_ncomp",
336            detail: "no valid models could be fitted; all candidate ncomp values failed — check data for degeneracy or reduce the range".to_string(),
337        });
338    }
339
340    let best_idx = match criterion {
341        SelectionCriterion::Aic => criteria
342            .iter()
343            .enumerate()
344            .min_by(|(_, a), (_, b)| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
345            .map_or(0, |(i, _)| i),
346        SelectionCriterion::Bic => criteria
347            .iter()
348            .enumerate()
349            .min_by(|(_, a), (_, b)| a.2.partial_cmp(&b.2).unwrap_or(std::cmp::Ordering::Equal))
350            .map_or(0, |(i, _)| i),
351        SelectionCriterion::Gcv => criteria
352            .iter()
353            .enumerate()
354            .min_by(|(_, a), (_, b)| a.3.partial_cmp(&b.3).unwrap_or(std::cmp::Ordering::Equal))
355            .map_or(0, |(i, _)| i),
356    };
357
358    Ok(ModelSelectionResult {
359        best_ncomp: criteria[best_idx].0,
360        criteria,
361    })
362}
363
364/// Predict new responses using a fitted functional linear model.
365///
366/// # Arguments
367/// * `fit` - A fitted [`FregreLmResult`]
368/// * `new_data` - New functional predictor matrix (n_new × m)
369/// * `new_scalar` - Optional new scalar covariates (n_new × p)
370///
371/// # Examples
372///
373/// ```
374/// use fdars_core::matrix::FdMatrix;
375/// use fdars_core::scalar_on_function::{fregre_lm, predict_fregre_lm};
376///
377/// let (n, m) = (20, 30);
378/// let data = FdMatrix::from_column_major(
379///     (0..n * m).map(|k| {
380///         let i = (k % n) as f64;
381///         let j = (k / n) as f64;
382///         ((i + 1.0) * j * 0.2).sin()
383///     }).collect(),
384///     n, m,
385/// ).unwrap();
386/// let y: Vec<f64> = (0..n).map(|i| (i as f64 * 0.5).sin()).collect();
387/// let fit = fregre_lm(&data, &y, None, 3).unwrap();
388///
389/// // Predict on same data
390/// let predictions = predict_fregre_lm(&fit, &data, None);
391/// assert_eq!(predictions.len(), 20);
392/// ```
393pub fn predict_fregre_lm(
394    fit: &FregreLmResult,
395    new_data: &FdMatrix,
396    new_scalar: Option<&FdMatrix>,
397) -> Vec<f64> {
398    let (n_new, m) = new_data.shape();
399    let ncomp = fit.ncomp;
400    let p_scalar = fit.gamma.len();
401
402    let mut predictions = vec![0.0; n_new];
403    for i in 0..n_new {
404        let mut yhat = fit.intercept;
405        for k in 0..ncomp {
406            let mut s = 0.0;
407            for j in 0..m {
408                s += (new_data[(i, j)] - fit.fpca.mean[j]) * fit.fpca.rotation[(j, k)];
409            }
410            yhat += fit.coefficients[1 + k] * s;
411        }
412        if let Some(sc) = new_scalar {
413            for j in 0..p_scalar {
414                yhat += fit.gamma[j] * sc[(i, j)];
415            }
416        }
417        predictions[i] = yhat;
418    }
419    predictions
420}