1use super::{
2 build_design_matrix, cholesky_factor, compute_beta_se, compute_fitted, compute_ols_std_errors,
3 compute_r_squared, compute_xtx, ols_solve, recover_beta_t, resolve_ncomp,
4 validate_fregre_inputs, FregreCvResult, FregreLmResult, ModelSelectionResult,
5 SelectionCriterion,
6};
7use crate::cv::create_folds;
8use crate::error::FdarError;
9use crate::matrix::FdMatrix;
10use crate::regression::fdata_to_pc_1d;
11
12#[must_use = "expensive computation whose result should not be discarded"]
63pub fn fregre_lm(
64 data: &FdMatrix,
65 y: &[f64],
66 scalar_covariates: Option<&FdMatrix>,
67 ncomp: usize,
68) -> Result<FregreLmResult, FdarError> {
69 let (n, m) = data.shape();
70 validate_fregre_inputs(n, m, y, scalar_covariates)?;
71
72 let ncomp = resolve_ncomp(ncomp, data, y, scalar_covariates, n, m)?;
73
74 let fpca = fdata_to_pc_1d(data, ncomp)?;
75 let design = build_design_matrix(&fpca.scores, ncomp, scalar_covariates, n);
76 let p_total = design.ncols();
77 let (coeffs, hat_diag) = ols_solve(&design, y)?;
78
79 let fitted_values = compute_fitted(&design, &coeffs);
80 let residuals: Vec<f64> = y
81 .iter()
82 .zip(&fitted_values)
83 .map(|(&yi, &yh)| yi - yh)
84 .collect();
85 let (r_squared, r_squared_adj) = compute_r_squared(y, &residuals, p_total);
86
87 let ss_res: f64 = residuals.iter().map(|r| r * r).sum();
88 let df_resid = (n as f64 - p_total as f64).max(1.0);
89 let residual_se = (ss_res / df_resid).sqrt();
90 let sigma2 = ss_res / df_resid;
91
92 let xtx = compute_xtx(&design);
93 let l = cholesky_factor(&xtx, p_total).unwrap_or_else(|_| vec![1.0; p_total * p_total]);
94 let std_errors = compute_ols_std_errors(&l, p_total, sigma2);
95
96 let gcv = hat_diag
97 .iter()
98 .zip(&residuals)
99 .map(|(&h, &r)| (r / (1.0 - h).max(1e-10)).powi(2))
100 .sum::<f64>()
101 / n as f64;
102
103 let beta_t = recover_beta_t(&coeffs[1..=ncomp], &fpca.rotation, m);
104 let beta_se = compute_beta_se(&std_errors[1..=ncomp], &fpca.rotation, m);
105 let gamma: Vec<f64> = coeffs[1 + ncomp..].to_vec();
106
107 let nf = n as f64;
108 let rss = ss_res;
109 let aic = nf * (rss / nf).ln() + 2.0 * p_total as f64;
110 let bic = nf * (rss / nf).ln() + (nf).ln() * p_total as f64;
111
112 Ok(FregreLmResult {
113 intercept: coeffs[0],
114 beta_t,
115 beta_se,
116 gamma,
117 fitted_values,
118 residuals,
119 r_squared,
120 r_squared_adj,
121 std_errors,
122 ncomp,
123 fpca,
124 coefficients: coeffs,
125 residual_se,
126 gcv,
127 aic,
128 bic,
129 })
130}
131
132fn copy_rows(dst: &mut FdMatrix, src: &FdMatrix, src_rows: &[usize]) {
138 let ncols = src.ncols();
139 for (dst_i, &src_i) in src_rows.iter().enumerate() {
140 for j in 0..ncols {
141 dst[(dst_i, j)] = src[(src_i, j)];
142 }
143 }
144}
145
146fn cv_error_for_k(
148 data: &FdMatrix,
149 y: &[f64],
150 scalar_covariates: Option<&FdMatrix>,
151 k: usize,
152 n_folds: usize,
153 folds: &[usize],
154) -> Result<f64, FdarError> {
155 let n = data.nrows();
156 let ncols = data.ncols();
157 let mut total_error = 0.0;
158 let mut count = 0;
159
160 for fold in 0..n_folds {
161 let train_idx: Vec<usize> = (0..n).filter(|&i| folds[i] != fold).collect();
162 let test_idx: Vec<usize> = (0..n).filter(|&i| folds[i] == fold).collect();
163 let n_test = test_idx.len();
164 if n_test == 0 || train_idx.len() < k + 2 {
165 continue;
166 }
167
168 let mut train_data = FdMatrix::zeros(train_idx.len(), ncols);
169 let mut test_data = FdMatrix::zeros(n_test, ncols);
170 copy_rows(&mut train_data, data, &train_idx);
171 copy_rows(&mut test_data, data, &test_idx);
172
173 let train_y: Vec<f64> = train_idx.iter().map(|&i| y[i]).collect();
174 let test_y: Vec<f64> = test_idx.iter().map(|&i| y[i]).collect();
175
176 let train_sc = scalar_covariates.map(|sc| {
177 let mut m = FdMatrix::zeros(train_idx.len(), sc.ncols());
178 copy_rows(&mut m, sc, &train_idx);
179 m
180 });
181 let test_sc = scalar_covariates.map(|sc| {
182 let mut m = FdMatrix::zeros(n_test, sc.ncols());
183 copy_rows(&mut m, sc, &test_idx);
184 m
185 });
186
187 let Ok(fit) = fregre_lm(&train_data, &train_y, train_sc.as_ref(), k) else {
188 continue;
189 };
190
191 let predictions = predict_fregre_lm(&fit, &test_data, test_sc.as_ref());
192 let fold_mse: f64 = predictions
193 .iter()
194 .zip(&test_y)
195 .map(|(&yhat, &yi)| (yhat - yi).powi(2))
196 .sum::<f64>()
197 / n_test as f64;
198
199 total_error += fold_mse * n_test as f64;
200 count += n_test;
201 }
202
203 if count > 0 {
204 Ok(total_error / count as f64)
205 } else {
206 Err(FdarError::ComputationFailed {
207 operation: "CV error computation",
208 detail: "no valid folds produced predictions; try reducing ncomp or increasing the number of observations".to_string(),
209 })
210 }
211}
212
213#[must_use = "expensive computation whose result should not be discarded"]
230pub fn fregre_cv(
231 data: &FdMatrix,
232 y: &[f64],
233 scalar_covariates: Option<&FdMatrix>,
234 k_min: usize,
235 k_max: usize,
236 n_folds: usize,
237) -> Result<FregreCvResult, FdarError> {
238 let n = data.nrows();
239 if n < n_folds {
240 return Err(FdarError::InvalidDimension {
241 parameter: "data",
242 expected: format!("at least {n_folds} rows (n_folds)"),
243 actual: format!("{n}"),
244 });
245 }
246 if k_min < 1 || k_min > k_max {
247 return Err(FdarError::InvalidParameter {
248 parameter: "k_min/k_max",
249 message: format!("k_min={k_min} must be >= 1 and <= k_max={k_max}"),
250 });
251 }
252
253 let k_max = k_max.min(n - 2).min(data.ncols());
254
255 let folds = create_folds(n, n_folds, 42);
257
258 let mut k_values = Vec::new();
259 let mut cv_errors = Vec::new();
260
261 for k in k_min..=k_max {
262 if let Ok(err) = cv_error_for_k(data, y, scalar_covariates, k, n_folds, &folds) {
263 k_values.push(k);
264 cv_errors.push(err);
265 }
266 }
267
268 if k_values.is_empty() {
269 return Err(FdarError::ComputationFailed {
270 operation: "fregre_cv",
271 detail: "no valid K values produced CV errors; all candidate ncomp values failed — check data for zero-variance columns or increase n".to_string(),
272 });
273 }
274
275 let (optimal_idx, &min_cv_error) = cv_errors
276 .iter()
277 .enumerate()
278 .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
279 .expect("checked: cv_errors is non-empty");
280
281 Ok(FregreCvResult {
282 k_values: k_values.clone(),
283 cv_errors,
284 optimal_k: k_values[optimal_idx],
285 min_cv_error,
286 })
287}
288
289#[must_use = "expensive computation whose result should not be discarded"]
311pub fn model_selection_ncomp(
312 data: &FdMatrix,
313 y: &[f64],
314 scalar_covariates: Option<&FdMatrix>,
315 max_ncomp: usize,
316 criterion: SelectionCriterion,
317) -> Result<ModelSelectionResult, FdarError> {
318 if max_ncomp == 0 {
319 return Err(FdarError::InvalidParameter {
320 parameter: "max_ncomp",
321 message: "must be >= 1".to_string(),
322 });
323 }
324
325 let mut criteria = Vec::with_capacity(max_ncomp);
326
327 for k in 1..=max_ncomp {
328 if let Ok(fit) = fregre_lm(data, y, scalar_covariates, k) {
329 criteria.push((k, fit.aic, fit.bic, fit.gcv));
330 }
331 }
332
333 if criteria.is_empty() {
334 return Err(FdarError::ComputationFailed {
335 operation: "model_selection_ncomp",
336 detail: "no valid models could be fitted; all candidate ncomp values failed — check data for degeneracy or reduce the range".to_string(),
337 });
338 }
339
340 let best_idx = match criterion {
341 SelectionCriterion::Aic => criteria
342 .iter()
343 .enumerate()
344 .min_by(|(_, a), (_, b)| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
345 .map_or(0, |(i, _)| i),
346 SelectionCriterion::Bic => criteria
347 .iter()
348 .enumerate()
349 .min_by(|(_, a), (_, b)| a.2.partial_cmp(&b.2).unwrap_or(std::cmp::Ordering::Equal))
350 .map_or(0, |(i, _)| i),
351 SelectionCriterion::Gcv => criteria
352 .iter()
353 .enumerate()
354 .min_by(|(_, a), (_, b)| a.3.partial_cmp(&b.3).unwrap_or(std::cmp::Ordering::Equal))
355 .map_or(0, |(i, _)| i),
356 };
357
358 Ok(ModelSelectionResult {
359 best_ncomp: criteria[best_idx].0,
360 criteria,
361 })
362}
363
364pub fn predict_fregre_lm(
394 fit: &FregreLmResult,
395 new_data: &FdMatrix,
396 new_scalar: Option<&FdMatrix>,
397) -> Vec<f64> {
398 let (n_new, m) = new_data.shape();
399 let ncomp = fit.ncomp;
400 let p_scalar = fit.gamma.len();
401
402 let mut predictions = vec![0.0; n_new];
403 for i in 0..n_new {
404 let mut yhat = fit.intercept;
405 for k in 0..ncomp {
406 let mut s = 0.0;
407 for j in 0..m {
408 s += (new_data[(i, j)] - fit.fpca.mean[j]) * fit.fpca.rotation[(j, k)];
409 }
410 yhat += fit.coefficients[1 + k] * s;
411 }
412 if let Some(sc) = new_scalar {
413 for j in 0..p_scalar {
414 yhat += fit.gamma[j] * sc[(i, j)];
415 }
416 }
417 predictions[i] = yhat;
418 }
419 predictions
420}