Skip to main content

fdars_core/classification/
cv.rs

1//! Cross-validation for functional classification.
2
3use crate::error::FdarError;
4use crate::matrix::FdMatrix;
5use crate::regression::fdata_to_pc_1d;
6
7use super::lda::{lda_params, lda_predict};
8use super::qda::{build_qda_params, qda_predict};
9use super::{remap_labels, ClassifCvResult};
10use crate::linalg::cholesky_d;
11
12/// K-fold cross-validated error rate for functional classification.
13///
14/// # Arguments
15/// * `data` — Functional data (n × m)
16/// * `argvals` — Evaluation points
17/// * `y` — Class labels
18/// * `scalar_covariates` — Optional scalar covariates
19/// * `method` — "lda", "qda", "knn", "kernel", "dd"
20/// * `ncomp` — Number of FPC components (for lda/qda/knn)
21/// * `nfold` — Number of CV folds
22/// * `seed` — Random seed for fold assignment
23///
24/// # Errors
25///
26/// Returns [`FdarError::InvalidParameter`] if `nfold < 2` or `nfold > n`.
27/// Returns [`FdarError::InvalidParameter`] if `y` contains fewer than 2 distinct classes.
28///
29/// # Examples
30///
31/// ```
32/// use fdars_core::matrix::FdMatrix;
33/// use fdars_core::classification::cv::fclassif_cv;
34///
35/// let argvals: Vec<f64> = (0..10).map(|i| i as f64 / 9.0).collect();
36/// let data = FdMatrix::from_column_major(
37///     (0..100).map(|i| (i as f64 * 0.1).sin()).collect(),
38///     10, 10,
39/// ).unwrap();
40/// let y = vec![0, 0, 0, 0, 0, 1, 1, 1, 1, 1];
41/// let result = fclassif_cv(&data, &argvals, &y, None, "lda", 2, 3, 42).unwrap();
42/// assert!(result.error_rate >= 0.0 && result.error_rate <= 1.0);
43/// ```
44#[must_use = "expensive computation whose result should not be discarded"]
45pub fn fclassif_cv(
46    data: &FdMatrix,
47    argvals: &[f64],
48    y: &[usize],
49    scalar_covariates: Option<&FdMatrix>,
50    method: &str,
51    ncomp: usize,
52    nfold: usize,
53    seed: u64,
54) -> Result<ClassifCvResult, FdarError> {
55    let n = data.nrows();
56    if n < nfold || nfold < 2 {
57        return Err(FdarError::InvalidParameter {
58            parameter: "nfold",
59            message: format!("need 2 <= nfold <= n, got nfold={nfold}, n={n}"),
60        });
61    }
62
63    let (labels, g) = remap_labels(y);
64    if g < 2 {
65        return Err(FdarError::InvalidParameter {
66            parameter: "y",
67            message: format!("need at least 2 classes, got {g}"),
68        });
69    }
70
71    // Assign folds
72    let folds = assign_folds(n, nfold, seed);
73
74    let mut fold_errors = Vec::with_capacity(nfold);
75
76    for fold in 0..nfold {
77        let (train_idx, test_idx) = fold_split(&folds, fold);
78        let train_data = extract_class_data(data, &train_idx);
79        let test_data = extract_class_data(data, &test_idx);
80        let train_labels: Vec<usize> = train_idx.iter().map(|&i| labels[i]).collect();
81        let test_labels: Vec<usize> = test_idx.iter().map(|&i| labels[i]).collect();
82
83        let train_cov = scalar_covariates.map(|c| extract_class_data(c, &train_idx));
84        let test_cov = scalar_covariates.map(|c| extract_class_data(c, &test_idx));
85
86        let predictions = cv_fold_predict(
87            &train_data,
88            &test_data,
89            argvals,
90            &train_labels,
91            g,
92            train_cov.as_ref(),
93            test_cov.as_ref(),
94            method,
95            ncomp,
96        );
97
98        let n_test = test_labels.len();
99        let errors = match predictions {
100            Some(pred) => {
101                let wrong = pred
102                    .iter()
103                    .zip(&test_labels)
104                    .filter(|(&p, &t)| p != t)
105                    .count();
106                wrong as f64 / n_test as f64
107            }
108            None => 1.0,
109        };
110        fold_errors.push(errors);
111    }
112
113    let error_rate = fold_errors.iter().sum::<f64>() / nfold as f64;
114
115    Ok(ClassifCvResult {
116        error_rate,
117        fold_errors,
118        best_ncomp: ncomp,
119    })
120}
121
122/// Assign observations to folds.
123pub(super) fn assign_folds(n: usize, nfold: usize, seed: u64) -> Vec<usize> {
124    use rand::prelude::*;
125    let mut rng = StdRng::seed_from_u64(seed);
126    let mut indices: Vec<usize> = (0..n).collect();
127    indices.shuffle(&mut rng);
128
129    let mut folds = vec![0usize; n];
130    for (rank, &idx) in indices.iter().enumerate() {
131        folds[idx] = rank % nfold;
132    }
133    folds
134}
135
136/// Split indices into train and test for given fold.
137pub(super) fn fold_split(folds: &[usize], fold: usize) -> (Vec<usize>, Vec<usize>) {
138    let train: Vec<usize> = (0..folds.len()).filter(|&i| folds[i] != fold).collect();
139    let test: Vec<usize> = (0..folds.len()).filter(|&i| folds[i] == fold).collect();
140    (train, test)
141}
142
143/// Predict on test set for one CV fold.
144fn cv_fold_predict(
145    train_data: &FdMatrix,
146    test_data: &FdMatrix,
147    _argvals: &[f64],
148    train_labels: &[usize],
149    g: usize,
150    train_cov: Option<&FdMatrix>,
151    test_cov: Option<&FdMatrix>,
152    method: &str,
153    ncomp: usize,
154) -> Option<Vec<usize>> {
155    let fpca = fdata_to_pc_1d(train_data, ncomp).ok()?;
156    match method {
157        "lda" => {
158            let predictions =
159                project_and_classify_lda(test_data, &fpca, train_labels, g, train_cov, test_cov);
160            Some(predictions)
161        }
162        "qda" => {
163            let predictions =
164                project_and_classify_qda(test_data, &fpca, train_labels, g, train_cov, test_cov);
165            Some(predictions)
166        }
167        "knn" => {
168            let predictions =
169                project_and_classify_knn(test_data, &fpca, train_labels, g, train_cov, test_cov, 5);
170            Some(predictions)
171        }
172        // kernel and dd classifiers don't support out-of-sample prediction on new data
173        _ => None,
174    }
175}
176
177/// Project test data onto FPCA basis (mean-center, multiply by rotation).
178pub(super) fn project_test_onto_fpca(
179    test_data: &FdMatrix,
180    fpca: &crate::regression::FpcaResult,
181) -> FdMatrix {
182    let n_test = test_data.nrows();
183    let m = test_data.ncols();
184    let d_pc = fpca.scores.ncols();
185    let mut test_features = FdMatrix::zeros(n_test, d_pc);
186    for i in 0..n_test {
187        for k in 0..d_pc {
188            let mut score = 0.0;
189            for j in 0..m {
190                score += (test_data[(i, j)] - fpca.mean[j]) * fpca.rotation[(j, k)];
191            }
192            test_features[(i, k)] = score;
193        }
194    }
195    test_features
196}
197
198/// Append scalar covariates to FPCA scores to form augmented feature matrix.
199fn append_scalar_covariates(scores: &FdMatrix, scalar_covariates: Option<&FdMatrix>) -> FdMatrix {
200    match scalar_covariates {
201        None => scores.clone(),
202        Some(cov) => {
203            let n = scores.nrows();
204            let d_pc = scores.ncols();
205            let d_cov = cov.ncols();
206            let mut features = FdMatrix::zeros(n, d_pc + d_cov);
207            for i in 0..n {
208                for j in 0..d_pc {
209                    features[(i, j)] = scores[(i, j)];
210                }
211                for j in 0..d_cov {
212                    features[(i, d_pc + j)] = cov[(i, j)];
213                }
214            }
215            features
216        }
217    }
218}
219
220/// Project test data onto training FPCA and classify with LDA.
221fn project_and_classify_lda(
222    test_data: &FdMatrix,
223    fpca: &crate::regression::FpcaResult,
224    train_labels: &[usize],
225    g: usize,
226    train_cov: Option<&FdMatrix>,
227    test_cov: Option<&FdMatrix>,
228) -> Vec<usize> {
229    let test_pc = project_test_onto_fpca(test_data, fpca);
230    let test_features = append_scalar_covariates(&test_pc, test_cov);
231
232    let train_features = append_scalar_covariates(&fpca.scores, train_cov);
233    let (class_means, cov, priors) = lda_params(&train_features, train_labels, g);
234    let d = train_features.ncols();
235    match cholesky_d(&cov, d) {
236        Ok(chol) => lda_predict(&test_features, &class_means, &chol, &priors, g),
237        Err(_) => vec![0; test_data.nrows()],
238    }
239}
240
241/// Project test data onto training FPCA and classify with QDA.
242fn project_and_classify_qda(
243    test_data: &FdMatrix,
244    fpca: &crate::regression::FpcaResult,
245    train_labels: &[usize],
246    g: usize,
247    train_cov: Option<&FdMatrix>,
248    test_cov: Option<&FdMatrix>,
249) -> Vec<usize> {
250    let n_test = test_data.nrows();
251    let test_pc = project_test_onto_fpca(test_data, fpca);
252    let test_features = append_scalar_covariates(&test_pc, test_cov);
253
254    let train_features = append_scalar_covariates(&fpca.scores, train_cov);
255
256    match build_qda_params(&train_features, train_labels, g) {
257        Ok((class_means, class_chols, class_log_dets, priors)) => qda_predict(
258            &test_features,
259            &class_means,
260            &class_chols,
261            &class_log_dets,
262            &priors,
263            g,
264        ),
265        Err(_) => vec![0; n_test],
266    }
267}
268
269/// Project test data and classify with k-NN.
270fn project_and_classify_knn(
271    test_data: &FdMatrix,
272    fpca: &crate::regression::FpcaResult,
273    train_labels: &[usize],
274    g: usize,
275    train_cov: Option<&FdMatrix>,
276    test_cov: Option<&FdMatrix>,
277    k_nn: usize,
278) -> Vec<usize> {
279    let n_test = test_data.nrows();
280    let n_train = fpca.scores.nrows();
281
282    let test_pc = project_test_onto_fpca(test_data, fpca);
283    let test_features = append_scalar_covariates(&test_pc, test_cov);
284    let train_features = append_scalar_covariates(&fpca.scores, train_cov);
285    let d = train_features.ncols();
286
287    (0..n_test)
288        .map(|i| {
289            // Distances to all training points in augmented feature space
290            let mut dists: Vec<(f64, usize)> = (0..n_train)
291                .map(|t| {
292                    let d_sq: f64 = (0..d)
293                        .map(|k| (test_features[(i, k)] - train_features[(t, k)]).powi(2))
294                        .sum();
295                    (d_sq, train_labels[t])
296                })
297                .collect();
298            let k_eff = k_nn.min(n_train);
299            if k_eff > 0 && k_eff < dists.len() {
300                dists.select_nth_unstable_by(k_eff - 1, |a, b| {
301                    a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal)
302                });
303            }
304
305            let mut votes = vec![0usize; g];
306            for &(_, label) in dists.iter().take(k_eff) {
307                votes[label] += 1;
308            }
309            votes
310                .iter()
311                .enumerate()
312                .max_by_key(|&(_, &v)| v)
313                .map_or(0, |(c, _)| c)
314        })
315        .collect()
316}
317
318/// Extract rows corresponding to given indices into a new FdMatrix.
319pub(super) fn extract_class_data(data: &FdMatrix, indices: &[usize]) -> FdMatrix {
320    let nc = indices.len();
321    let m = data.ncols();
322    let mut result = FdMatrix::zeros(nc, m);
323    for (ri, &i) in indices.iter().enumerate() {
324        for j in 0..m {
325            result[(ri, j)] = data[(i, j)];
326        }
327    }
328    result
329}