1#![allow(clippy::needless_range_loop)]
6use super::types::{
7 CrossValResult, DecisionTree, IsolationNode, KMeansResult, LinearRegression, Matrix, OobResult,
8 PcaResult, TreeNode, TsneResult,
9};
10
/// Dot product of two slices; pairs are zipped, so the shorter length wins.
#[allow(missing_docs)]
pub fn dot(a: &[f64], b: &[f64]) -> f64 {
    let mut acc = 0.0;
    for (lhs, rhs) in a.iter().zip(b.iter()) {
        acc += lhs * rhs;
    }
    acc
}
/// Element-wise sum of two slices (truncated to the shorter input).
#[allow(dead_code, missing_docs)]
pub fn vec_add(a: &[f64], b: &[f64]) -> Vec<f64> {
    let mut out = Vec::with_capacity(a.len().min(b.len()));
    for (lhs, rhs) in a.iter().zip(b.iter()) {
        out.push(lhs + rhs);
    }
    out
}
/// Element-wise difference `a - b` (truncated to the shorter input).
#[allow(missing_docs)]
pub fn vec_sub(a: &[f64], b: &[f64]) -> Vec<f64> {
    let mut out = Vec::with_capacity(a.len().min(b.len()));
    for (lhs, rhs) in a.iter().zip(b.iter()) {
        out.push(lhs - rhs);
    }
    out
}
/// Multiplies every element of `a` by the scalar `s`.
#[allow(dead_code, missing_docs)]
pub fn vec_scale(a: &[f64], s: f64) -> Vec<f64> {
    let mut out = Vec::with_capacity(a.len());
    for value in a {
        out.push(value * s);
    }
    out
}
/// Euclidean (L2) norm of `a`.
#[allow(missing_docs)]
pub fn vec_norm(a: &[f64]) -> f64 {
    // sum of squares, then square root — equivalent to sqrt(dot(a, a))
    a.iter().map(|x| x * x).sum::<f64>().sqrt()
}
/// Column-wise mean of a row-major data set.
///
/// Returns an empty vector for empty input. Panics if any row is longer
/// than the first row (indexing past the accumulator).
#[allow(missing_docs)]
pub fn mean_vec(data: &[Vec<f64>]) -> Vec<f64> {
    let first = match data.first() {
        Some(row) => row,
        None => return Vec::new(),
    };
    let count = data.len() as f64;
    let mut sums = vec![0.0f64; first.len()];
    for row in data {
        for (j, &value) in row.iter().enumerate() {
            sums[j] += value;
        }
    }
    for slot in sums.iter_mut() {
        *slot /= count;
    }
    sums
}
/// Column-wise population variance (divides by `n`, not `n - 1`).
///
/// Returns an empty vector for empty input.
#[allow(missing_docs)]
pub fn variance_vec(data: &[Vec<f64>]) -> Vec<f64> {
    if data.is_empty() {
        return Vec::new();
    }
    let count = data.len() as f64;
    let dim = data[0].len();
    // First pass: column means.
    let mut mean = vec![0.0f64; dim];
    for row in data {
        for (j, &value) in row.iter().enumerate() {
            mean[j] += value;
        }
    }
    for m in mean.iter_mut() {
        *m /= count;
    }
    // Second pass: mean squared deviation per column.
    let mut spread = vec![0.0f64; dim];
    for row in data {
        for j in 0..dim {
            spread[j] += (row[j] - mean[j]).powi(2);
        }
    }
    for s in spread.iter_mut() {
        *s /= count;
    }
    spread
}
59#[allow(dead_code, missing_docs)]
60pub fn standardize(data: &[Vec<f64>]) -> (Vec<Vec<f64>>, Vec<f64>, Vec<f64>) {
61 let mu = mean_vec(data);
62 let var = variance_vec(data);
63 let std: Vec<f64> = var.iter().map(|v| v.sqrt().max(1e-300)).collect();
64 let normed = data
65 .iter()
66 .map(|row| {
67 row.iter()
68 .zip(mu.iter())
69 .zip(std.iter())
70 .map(|((x, m), s)| (x - m) / s)
71 .collect()
72 })
73 .collect();
74 (normed, mu, std)
75}
/// Solves the square linear system `a * x = b` by Gaussian elimination
/// with partial (column-max) pivoting, followed by back substitution.
///
/// Returns `None` when a pivot's magnitude falls below `1e-15`, i.e. the
/// matrix is treated as singular. For `n == 0` the loop never runs and an
/// empty solution is returned.
///
/// # Panics
/// Panics when `a` is not square or `b.len() != a.rows`.
pub fn solve_linear_system(a: &Matrix, b: &[f64]) -> Option<Vec<f64>> {
    let n = a.rows;
    assert_eq!(a.cols, n);
    assert_eq!(b.len(), n);
    // Work on copies so the caller's matrix and RHS stay untouched.
    let mut m = a.clone();
    let mut rhs = b.to_vec();
    for col in 0..n {
        // Pick the row (at or below `col`) with the largest |entry| in this
        // column; NaN comparisons fall back to Equal rather than panicking.
        let max_row = (col..n).max_by(|&i, &j| {
            m.get(i, col)
                .abs()
                .partial_cmp(&m.get(j, col).abs())
                .unwrap_or(std::cmp::Ordering::Equal)
        })?;
        // Swap the pivot row into position, element by element.
        for c in 0..n {
            let tmp = m.get(col, c);
            m.set(col, c, m.get(max_row, c));
            m.set(max_row, c, tmp);
        }
        rhs.swap(col, max_row);
        let pivot = m.get(col, col);
        if pivot.abs() < 1e-15 {
            return None;
        }
        // Eliminate the column from every row below the pivot.
        for row in col + 1..n {
            let factor = m.get(row, col) / pivot;
            for c in col..n {
                let val = m.get(row, c) - factor * m.get(col, c);
                m.set(row, c, val);
            }
            rhs[row] -= factor * rhs[col];
        }
    }
    // Back substitution on the upper-triangular system.
    let mut x = vec![0.0f64; n];
    for i in (0..n).rev() {
        let mut sum = rhs[i];
        for j in i + 1..n {
            sum -= m.get(i, j) * x[j];
        }
        x[i] = sum / m.get(i, i);
    }
    Some(x)
}
/// Logistic sigmoid: maps any real `z` into (0, 1).
#[allow(missing_docs)]
pub fn sigmoid(z: f64) -> f64 {
    let e = (-z).exp();
    1.0 / (1.0 + e)
}
/// Applies the logistic sigmoid to every element of `z`.
#[allow(dead_code, missing_docs)]
pub fn sigmoid_vec(z: &[f64]) -> Vec<f64> {
    // sigmoid inlined: 1 / (1 + e^-v)
    z.iter().map(|&v| 1.0 / (1.0 + (-v).exp())).collect()
}
/// Lloyd's k-means with a deterministic farthest-point initialisation.
///
/// The first centroid is `data[0]`; each subsequent centroid is the point
/// with the largest squared distance to its nearest existing centroid.
/// Iteration stops when labels stop changing, the inertia change drops
/// below `tol`, or `max_iter` is reached.
///
/// # Panics
/// Panics when `data` is empty (indexing `data[0]`) or `k > data.len()`.
pub fn kmeans(data: &[Vec<f64>], k: usize, max_iter: usize, tol: f64) -> KMeansResult {
    let n = data.len();
    let d = data[0].len();
    assert!(k <= n, "k must be <= number of data points");
    let mut centroids: Vec<Vec<f64>> = Vec::with_capacity(k);
    centroids.push(data[0].clone());
    // Farthest-point seeding: no RNG, so results are reproducible.
    for _ in 1..k {
        // Squared distance from each point to its nearest current centroid.
        let dists: Vec<f64> = data
            .iter()
            .map(|p| {
                centroids
                    .iter()
                    .map(|c| {
                        p.iter()
                            .zip(c.iter())
                            .map(|(a, b)| (a - b).powi(2))
                            .sum::<f64>()
                    })
                    .fold(f64::MAX, f64::min)
            })
            .collect();
        // NOTE(review): `total` is computed but unused — looks like leftover
        // from a k-means++-style weighted draw that was never wired in.
        let total: f64 = dists.iter().sum();
        let max_idx = dists
            .iter()
            .enumerate()
            .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
            .map(|(i, _)| i)
            .unwrap_or(0);
        let _ = total;
        centroids.push(data[max_idx].clone());
    }
    let mut labels = vec![0usize; n];
    let mut prev_inertia = f64::MAX;
    for _ in 0..max_iter {
        let mut changed = false;
        // Assignment step: label each point with its nearest centroid.
        for (i, point) in data.iter().enumerate() {
            let nearest = centroids
                .iter()
                .enumerate()
                .min_by(|(_, a), (_, b)| {
                    let da: f64 = point
                        .iter()
                        .zip(a.iter())
                        .map(|(x, y)| (x - y).powi(2))
                        .sum();
                    let db: f64 = point
                        .iter()
                        .zip(b.iter())
                        .map(|(x, y)| (x - y).powi(2))
                        .sum();
                    da.partial_cmp(&db).unwrap_or(std::cmp::Ordering::Equal)
                })
                .map(|(i, _)| i)
                .unwrap_or(0);
            if labels[i] != nearest {
                changed = true;
            }
            labels[i] = nearest;
        }
        // Update step: recompute each centroid as the mean of its members.
        for c in centroids.iter_mut() {
            *c = vec![0.0; d];
        }
        let mut counts = vec![0usize; k];
        for (i, point) in data.iter().enumerate() {
            let c = labels[i];
            for j in 0..d {
                centroids[c][j] += point[j];
            }
            counts[c] += 1;
        }
        for (c, centroid) in centroids.iter_mut().enumerate() {
            // max(1) leaves an empty cluster's centroid at the origin
            // instead of dividing by zero.
            let cnt = counts[c].max(1) as f64;
            for v in centroid.iter_mut() {
                *v /= cnt;
            }
        }
        // Inertia = total squared distance of points to their centroids.
        let inertia: f64 = data
            .iter()
            .zip(labels.iter())
            .map(|(p, &l)| {
                p.iter()
                    .zip(centroids[l].iter())
                    .map(|(a, b)| (a - b).powi(2))
                    .sum::<f64>()
            })
            .sum();
        if !changed || (prev_inertia - inertia).abs() < tol {
            break;
        }
        prev_inertia = inertia;
    }
    // Recompute inertia against the final centroids for the result.
    let inertia: f64 = data
        .iter()
        .zip(labels.iter())
        .map(|(p, &l)| {
            p.iter()
                .zip(centroids[l].iter())
                .map(|(a, b)| (a - b).powi(2))
                .sum::<f64>()
        })
        .sum();
    KMeansResult {
        centroids,
        labels,
        inertia,
    }
}
/// Fits PCA: centres the data, builds the (population) covariance matrix,
/// then extracts up to `n_components` leading eigenpairs by repeated
/// power iteration with Hotelling deflation.
///
/// # Panics
/// Panics when `data` is empty (indexing `data[0]`).
pub fn pca_fit(data: &[Vec<f64>], n_components: usize) -> PcaResult {
    let n = data.len() as f64;
    let d = data[0].len();
    let mean = mean_vec(data);
    // Centre every row so the covariance is about the mean.
    let centred: Vec<Vec<f64>> = data
        .iter()
        .map(|row| row.iter().zip(mean.iter()).map(|(x, m)| x - m).collect())
        .collect();
    // Covariance = sum of outer products of centred rows, divided by n.
    let mut cov = Matrix::zeros(d, d);
    for row in &centred {
        for i in 0..d {
            for j in 0..d {
                let val = cov.get(i, j) + row[i] * row[j];
                cov.set(i, j, val);
            }
        }
    }
    for v in cov.data.iter_mut() {
        *v /= n;
    }
    let mut components = Vec::new();
    let mut eigenvalues = Vec::new();
    let mut deflated = cov.clone();
    for _ in 0..n_components.min(d) {
        let (eigval, eigvec) = power_iteration(&deflated, 100, 1e-10);
        eigenvalues.push(eigval);
        components.push(eigvec.clone());
        // Deflate: subtract eigval * v v^T so the next power iteration
        // converges to the next-largest eigenpair.
        for i in 0..d {
            for j in 0..d {
                let val = deflated.get(i, j) - eigval * eigvec[i] * eigvec[j];
                deflated.set(i, j, val);
            }
        }
    }
    PcaResult {
        components,
        eigenvalues,
        mean,
    }
}
/// Power iteration: estimates the dominant eigenvalue/eigenvector pair of
/// a square matrix `m`.
///
/// Starts from the normalised all-ones vector, repeatedly applies `m`,
/// and stops when both the Rayleigh-quotient eigenvalue estimate and the
/// vector itself change by less than `tol`, or after `max_iter` steps.
pub fn power_iteration(m: &Matrix, max_iter: usize, tol: f64) -> (f64, Vec<f64>) {
    let n = m.rows;
    let mut v: Vec<f64> = vec![1.0; n];
    let norm = vec_norm(&v);
    for x in v.iter_mut() {
        *x /= norm;
    }
    let mut eigenval = 0.0f64;
    for _ in 0..max_iter {
        let mv = m.matvec(&v);
        // Rayleigh quotient v^T (M v) for the current unit vector v.
        let new_eigenval = dot(&v, &mv);
        // Clamp the norm so a zero vector does not produce NaNs.
        let norm_mv = vec_norm(&mv).max(1e-300);
        let new_v: Vec<f64> = mv.iter().map(|x| x / norm_mv).collect();
        let diff = vec_sub(&new_v, &v);
        v = new_v;
        if (new_eigenval - eigenval).abs() < tol && vec_norm(&diff) < tol {
            eigenval = new_eigenval;
            break;
        }
        eigenval = new_eigenval;
    }
    (eigenval, v)
}
/// Gini impurity of a label vector: `1 - sum(p_c^2)` over class
/// frequencies. Returns 0.0 for an empty slice.
///
/// NOTE(review): labels are bucketed via `l as i64`, so fractional labels
/// that truncate to the same integer (e.g. 0.2 and 0.9) are treated as
/// one class — assumes integer-valued class labels; confirm with callers.
pub fn gini_impurity(labels: &[f64]) -> f64 {
    if labels.is_empty() {
        return 0.0;
    }
    let total = labels.len() as f64;
    let mut freq: std::collections::HashMap<i64, usize> = std::collections::HashMap::new();
    for &label in labels {
        *freq.entry(label as i64).or_default() += 1;
    }
    let sum_sq: f64 = freq
        .values()
        .map(|&count| {
            let p = count as f64 / total;
            p * p
        })
        .sum();
    1.0 - sum_sq
}
/// Exhaustively searches every feature and every midpoint between
/// consecutive distinct values for the split with the highest Gini gain.
///
/// Splits leaving fewer than `min_samples` rows on either side are
/// skipped. Returns `Some((feature, threshold, gain))` only when the best
/// gain is strictly positive, otherwise `None`.
///
/// # Panics
/// Panics when `x` is empty (indexing `x[0]`).
pub fn best_split(x: &[Vec<f64>], y: &[f64], min_samples: usize) -> Option<(usize, f64, f64)> {
    // Local copy of the Gini criterion so the search is self-contained.
    let gini = |labels: &[f64]| -> f64 {
        if labels.is_empty() {
            return 0.0;
        }
        let n = labels.len() as f64;
        let mut counts = std::collections::HashMap::new();
        for &l in labels {
            *counts.entry(l as i64).or_insert(0usize) += 1;
        }
        1.0 - counts.values().map(|&c| (c as f64 / n).powi(2)).sum::<f64>()
    };
    let n_features = x[0].len();
    let total = x.len() as f64;
    let parent = gini(y);
    let mut best: Option<(usize, f64, f64)> = None;
    for feat in 0..n_features {
        // Candidate thresholds: midpoints between sorted distinct values.
        let mut cuts: Vec<f64> = x.iter().map(|row| row[feat]).collect();
        cuts.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        cuts.dedup();
        for pair in cuts.windows(2) {
            let thresh = (pair[0] + pair[1]) / 2.0;
            let mut left = Vec::new();
            let mut right = Vec::new();
            for (row, &label) in x.iter().zip(y.iter()) {
                if row[feat] <= thresh {
                    left.push(label);
                } else {
                    right.push(label);
                }
            }
            if left.len() < min_samples || right.len() < min_samples {
                continue;
            }
            // Gini gain: parent impurity minus size-weighted child impurity.
            let gain = parent
                - (left.len() as f64 / total) * gini(&left)
                - (right.len() as f64 / total) * gini(&right);
            // Keep the split only on a strict improvement (first always wins).
            match best {
                Some((_, _, best_gain)) if gain <= best_gain => {}
                _ => best = Some((feat, thresh, gain)),
            }
        }
    }
    best.filter(|&(_, _, gain)| gain > 0.0)
}
/// Returns the most frequent label in `y` (labels are bucketed as `i64`,
/// matching `gini_impurity`). Returns 0.0 for an empty slice.
///
/// Ties are broken deterministically in favour of the smallest label.
/// The previous implementation used `max_by_key` on count alone, so a tie
/// was resolved by `HashMap` iteration order — which is unspecified and
/// can differ between runs, making tree predictions nondeterministic.
#[allow(missing_docs)]
pub fn majority_vote(y: &[f64]) -> f64 {
    let mut counts = std::collections::HashMap::new();
    for &l in y {
        *counts.entry(l as i64).or_insert(0usize) += 1;
    }
    counts
        .into_iter()
        // Key: (count, Reverse(label)) — highest count wins; on equal
        // counts the smallest label wins, independent of hash order.
        .max_by_key(|&(label, count)| (count, std::cmp::Reverse(label)))
        .map(|(label, _)| label as f64)
        .unwrap_or(0.0)
}
/// Recursively grows a classification tree using Gini-gain splits.
///
/// Recursion stops (producing a majority-vote leaf) when `max_depth` is
/// reached, there are too few samples to split (`< 2 * min_samples`), the
/// node is already pure, or no positive-gain split exists.
#[allow(missing_docs)]
pub fn build_tree(
    x: &[Vec<f64>],
    y: &[f64],
    depth: usize,
    max_depth: usize,
    min_samples: usize,
) -> TreeNode {
    if depth >= max_depth || y.len() < 2 * min_samples || gini_impurity(y) == 0.0 {
        return TreeNode::Leaf {
            value: majority_vote(y),
        };
    }
    if let Some((feat, thresh, _)) = best_split(x, y, min_samples) {
        // Partition rows (and labels) on the chosen threshold.
        let (lx, ly): (Vec<_>, Vec<_>) = x
            .iter()
            .zip(y.iter())
            .filter(|(r, _)| r[feat] <= thresh)
            .map(|(r, &l)| (r.clone(), l))
            .unzip();
        let (rx, ry): (Vec<_>, Vec<_>) = x
            .iter()
            .zip(y.iter())
            .filter(|(r, _)| r[feat] > thresh)
            .map(|(r, &l)| (r.clone(), l))
            .unzip();
        // Defensive: a degenerate split becomes a leaf rather than an
        // infinite recursion on the same data.
        if lx.is_empty() || rx.is_empty() {
            return TreeNode::Leaf {
                value: majority_vote(y),
            };
        }
        TreeNode::Split {
            feature: feat,
            threshold: thresh,
            left: Box::new(build_tree(&lx, &ly, depth + 1, max_depth, min_samples)),
            right: Box::new(build_tree(&rx, &ry, depth + 1, max_depth, min_samples)),
        }
    } else {
        TreeNode::Leaf {
            value: majority_vote(y),
        }
    }
}
/// k-fold cross-validation of `LinearRegression::fit_normal`, scoring each
/// fold with R^2. Folds are consecutive slices (no shuffling), so callers
/// with ordered data should pre-shuffle.
///
/// NOTE(review): if every fold is skipped (e.g. `k > n` makes `fold_size`
/// 0 and `val_x` empty), `scores` stays empty and the mean/std below
/// divide by zero, yielding NaN — confirm whether callers guard `k`.
pub fn cross_val_linear_regression(x: &[Vec<f64>], y: &[f64], k: usize) -> CrossValResult {
    let n = x.len();
    let fold_size = n / k;
    let mut scores = Vec::new();
    for fold in 0..k {
        let val_start = fold * fold_size;
        let val_end = ((fold + 1) * fold_size).min(n);
        // Training set = everything outside [val_start, val_end).
        let train_x: Vec<Vec<f64>> = x[..val_start]
            .iter()
            .chain(x[val_end..].iter())
            .cloned()
            .collect();
        let train_y: Vec<f64> = y[..val_start]
            .iter()
            .chain(y[val_end..].iter())
            .cloned()
            .collect();
        let val_x = &x[val_start..val_end];
        let val_y = &y[val_start..val_end];
        if train_x.is_empty() || val_x.is_empty() {
            continue;
        }
        let model = LinearRegression::fit_normal(&train_x, &train_y);
        scores.push(model.r2_score(val_x, val_y));
    }
    // Population std of the fold scores.
    let mean = scores.iter().sum::<f64>() / scores.len() as f64;
    let std = (scores.iter().map(|s| (s - mean).powi(2)).sum::<f64>() / scores.len() as f64).sqrt();
    CrossValResult {
        fold_scores: scores,
        mean_score: mean,
        std_score: std,
    }
}
/// Builds an `n_classes x n_classes` confusion matrix indexed as
/// `cm[true][pred]`. Pairs with a label outside `[0, n_classes)` are
/// silently skipped; extra entries in the longer of the two slices are
/// ignored.
pub fn confusion_matrix(y_true: &[usize], y_pred: &[usize], n_classes: usize) -> Vec<Vec<usize>> {
    let mut counts = vec![vec![0usize; n_classes]; n_classes];
    for (&actual, &predicted) in y_true.iter().zip(y_pred.iter()) {
        if actual >= n_classes || predicted >= n_classes {
            continue;
        }
        counts[actual][predicted] += 1;
    }
    counts
}
/// Binary precision/recall/F1 read from a 2x2 confusion matrix laid out
/// as `cm[true][pred]` with class 1 positive. Returns all zeros when the
/// matrix has fewer than two rows; each ratio falls back to 0.0 when its
/// denominator is zero.
pub fn precision_recall_f1(cm: &[Vec<usize>]) -> (f64, f64, f64) {
    if cm.len() < 2 {
        return (0.0, 0.0, 0.0);
    }
    let tp = cm[1][1] as f64;
    let fp = cm[0][1] as f64;
    let false_neg = cm[1][0] as f64;
    // Guarded division: 0.0 whenever the denominator is not positive.
    let safe_div = |num: f64, den: f64| if den > 0.0 { num / den } else { 0.0 };
    let precision = safe_div(tp, tp + fp);
    let recall = safe_div(tp, tp + false_neg);
    let f1 = safe_div(2.0 * precision * recall, precision + recall);
    (precision, recall, f1)
}
/// Accuracy from a confusion matrix: trace divided by the total count.
/// Returns 0.0 when the matrix is empty of observations.
pub fn accuracy_from_cm(cm: &[Vec<usize>]) -> f64 {
    // Diagonal first (matches original evaluation order, including its
    // panic on rows shorter than their index).
    let diagonal: usize = cm.iter().enumerate().map(|(i, row)| row[i]).sum();
    let total: usize = cm.iter().map(|row| row.iter().sum::<usize>()).sum();
    if total == 0 {
        0.0
    } else {
        diagonal as f64 / total as f64
    }
}
/// Area under the ROC curve via the rank-sum (Mann–Whitney U)
/// formulation: `AUC = (R_pos - n_pos(n_pos+1)/2) / (n_pos * n_neg)`
/// where `R_pos` is the sum of ranks of the positive examples.
///
/// Tied scores receive their average rank, so each tied
/// positive/negative pair contributes exactly 0.5 — the previous
/// step-curve implementation resolved ties by sort order, which biased
/// the area (all-equal scores with balanced labels gave 0.75, not 0.5).
///
/// Returns 0.5 when only one class is present.
pub fn roc_auc(scores: &[f64], labels: &[usize]) -> f64 {
    let n = scores.len();
    let n_pos = labels.iter().filter(|&&l| l == 1).count();
    let n_neg = n - n_pos;
    if n_pos == 0 || n_neg == 0 {
        return 0.5;
    }
    // Indices ordered by ascending score (NaN compares Equal, as before).
    let mut order: Vec<usize> = (0..n).collect();
    order.sort_by(|&i, &j| {
        scores[i]
            .partial_cmp(&scores[j])
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    // Walk runs of equal scores; every member of a run gets the run's
    // average 1-based rank.
    let mut rank_sum_pos = 0.0;
    let mut start = 0;
    while start < n {
        let mut end = start;
        while end + 1 < n && scores[order[end + 1]] == scores[order[start]] {
            end += 1;
        }
        // Ranks start+1 ..= end+1 share this score; use their mean.
        let avg_rank = (start + end + 2) as f64 / 2.0;
        for &idx in &order[start..=end] {
            if labels[idx] == 1 {
                rank_sum_pos += avg_rank;
            }
        }
        start = end + 1;
    }
    let np = n_pos as f64;
    let nn = n_neg as f64;
    (rank_sum_pos - np * (np + 1.0) / 2.0) / (np * nn)
}
/// Mean absolute error over zipped pairs, divided by `y_true.len()`.
pub fn mae(y_true: &[f64], y_pred: &[f64]) -> f64 {
    let mut total = 0.0;
    for (truth, pred) in y_true.iter().zip(y_pred.iter()) {
        total += (truth - pred).abs();
    }
    total / y_true.len() as f64
}
/// Mean squared error over zipped pairs, divided by `y_true.len()`.
pub fn mse(y_true: &[f64], y_pred: &[f64]) -> f64 {
    let mut total = 0.0;
    for (truth, pred) in y_true.iter().zip(y_pred.iter()) {
        let diff = truth - pred;
        total += diff * diff;
    }
    total / y_true.len() as f64
}
/// Root mean squared error (square root of the MSE, inlined here).
pub fn rmse(y_true: &[f64], y_pred: &[f64]) -> f64 {
    let mean_sq = y_true
        .iter()
        .zip(y_pred.iter())
        .map(|(truth, pred)| (truth - pred).powi(2))
        .sum::<f64>()
        / y_true.len() as f64;
    mean_sq.sqrt()
}
/// Coefficient of determination R^2 = 1 - SS_res / SS_tot.
///
/// Returns 1.0 when `y_true` is constant (SS_tot == 0), avoiding a
/// division by zero.
pub fn r2(y_true: &[f64], y_pred: &[f64]) -> f64 {
    let mean_y = y_true.iter().sum::<f64>() / y_true.len() as f64;
    // Residual sum of squares over zipped pairs only.
    let mut ss_res = 0.0;
    for (truth, pred) in y_true.iter().zip(y_pred.iter()) {
        ss_res += (truth - pred).powi(2);
    }
    // Total sum of squares over all of y_true.
    let ss_tot: f64 = y_true.iter().map(|t| (t - mean_y).powi(2)).sum();
    if ss_tot == 0.0 {
        1.0
    } else {
        1.0 - ss_res / ss_tot
    }
}
// Unit tests covering the helpers in this module plus the model types
// (regression, clustering, trees, scalers, optimizers) defined elsewhere
// in `machine_learning`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::machine_learning::Activation;
    use crate::machine_learning::Adam;
    use crate::machine_learning::BatchNorm;
    use crate::machine_learning::DenseLayer;
    use crate::machine_learning::GaussianNaiveBayes;

    use crate::machine_learning::KNearestNeighbours;

    use crate::machine_learning::LogisticRegression;
    use crate::machine_learning::MinMaxScaler;
    use crate::machine_learning::RandomForest;

    use crate::machine_learning::RmsProp;
    use crate::machine_learning::StandardScaler;

    // Absolute-tolerance float comparison used throughout these tests.
    fn approx_eq(a: f64, b: f64, tol: f64) -> bool {
        (a - b).abs() < tol
    }
    #[test]
    fn test_matrix_zeros() {
        let m = Matrix::zeros(2, 3);
        assert_eq!(m.rows, 2);
        assert_eq!(m.cols, 3);
        assert!(m.data.iter().all(|&v| v == 0.0));
    }
    #[test]
    fn test_matrix_eye() {
        let m = Matrix::eye(3);
        assert_eq!(m.get(0, 0), 1.0);
        assert_eq!(m.get(0, 1), 0.0);
        assert_eq!(m.get(1, 1), 1.0);
    }
    #[test]
    fn test_matrix_matmul() {
        // Multiplying by the identity must leave the matrix unchanged.
        let a = Matrix::from_vec(2, 2, vec![1.0, 2.0, 3.0, 4.0]);
        let b = Matrix::from_vec(2, 2, vec![1.0, 0.0, 0.0, 1.0]);
        let c = a.matmul(&b);
        assert_eq!(c.data, a.data);
    }
    #[test]
    fn test_matrix_transpose() {
        let a = Matrix::from_vec(2, 3, vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]);
        let at = a.transpose();
        assert_eq!(at.rows, 3);
        assert_eq!(at.cols, 2);
        assert_eq!(at.get(1, 0), 2.0);
    }
    #[test]
    fn test_matrix_matvec() {
        let a = Matrix::from_vec(2, 2, vec![1.0, 0.0, 0.0, 2.0]);
        let v = vec![3.0, 4.0];
        let out = a.matvec(&v);
        assert_eq!(out, vec![3.0, 8.0]);
    }
    #[test]
    fn test_linear_regression_perfect_fit() {
        // y = 2x + 1 is exactly linear, so R^2 must be 1.
        let x: Vec<Vec<f64>> = (0..5).map(|i| vec![i as f64]).collect();
        let y: Vec<f64> = (0..5).map(|i| 2.0 * i as f64 + 1.0).collect();
        let model = LinearRegression::fit_normal(&x, &y);
        assert!(approx_eq(model.r2_score(&x, &y), 1.0, 1e-8));
    }
    #[test]
    fn test_linear_regression_gradient_descent() {
        let x: Vec<Vec<f64>> = (0..20).map(|i| vec![i as f64 / 10.0]).collect();
        let y: Vec<f64> = x.iter().map(|r| 3.0 * r[0] + 1.0).collect();
        let model = LinearRegression::fit_gradient_descent(&x, &y, 0.05, 2000);
        let r2 = model.r2_score(&x, &y);
        assert!(r2 > 0.95);
    }
    #[test]
    fn test_linear_regression_predict() {
        let x = vec![vec![0.0], vec![1.0], vec![2.0]];
        let y = vec![0.0, 1.0, 2.0];
        let model = LinearRegression::fit_normal(&x, &y);
        let p = model.predict_one(&[3.0]);
        assert!(approx_eq(p, 3.0, 0.01));
    }
    #[test]
    fn test_logistic_regression_linearly_separable() {
        let x: Vec<Vec<f64>> = (0..20).map(|i| vec![i as f64 - 10.0]).collect();
        let y: Vec<f64> = (0..20).map(|i| if i < 10 { 0.0 } else { 1.0 }).collect();
        let model = LogisticRegression::fit(&x, &y, 0.5, 1000);
        let acc = model.accuracy(&x, &y);
        assert!(acc >= 0.9);
    }
    #[test]
    fn test_logistic_regression_sigmoid() {
        assert!(approx_eq(sigmoid(0.0), 0.5, 1e-10));
        assert!(sigmoid(10.0) > 0.99);
        assert!(sigmoid(-10.0) < 0.01);
    }
    #[test]
    fn test_kmeans_two_clusters() {
        // Two well-separated 1-D groups: all points in a group must share
        // the same label.
        let mut data: Vec<Vec<f64>> = (0..10).map(|i| vec![i as f64]).collect();
        data.extend((100..110).map(|i| vec![i as f64]));
        let result = kmeans(&data, 2, 100, 1e-6);
        assert_eq!(result.centroids.len(), 2);
        assert_eq!(result.labels.len(), 20);
        let c0 = result.labels[0];
        for &l in &result.labels[..10] {
            assert_eq!(l, c0);
        }
    }
    #[test]
    fn test_kmeans_inertia_positive() {
        let data: Vec<Vec<f64>> = (0..10).map(|i| vec![i as f64, i as f64]).collect();
        let result = kmeans(&data, 3, 50, 1e-6);
        assert!(result.inertia >= 0.0);
    }
    #[test]
    fn test_pca_fit_shape() {
        let data: Vec<Vec<f64>> = (0..20)
            .map(|i| vec![i as f64, 2.0 * i as f64, 3.0 * i as f64])
            .collect();
        let pca = pca_fit(&data, 2);
        assert_eq!(pca.components.len(), 2);
        assert_eq!(pca.components[0].len(), 3);
    }
    #[test]
    fn test_pca_transform_shape() {
        let data: Vec<Vec<f64>> = (0..10).map(|i| vec![i as f64, i as f64 * 2.0]).collect();
        let pca = pca_fit(&data, 1);
        let proj = pca.transform(&data);
        assert_eq!(proj.len(), 10);
        assert_eq!(proj[0].len(), 1);
    }
    #[test]
    fn test_relu() {
        let x = vec![-1.0, 0.0, 2.0, -3.0, 4.0];
        let y = Activation::relu(&x);
        assert_eq!(y, vec![0.0, 0.0, 2.0, 0.0, 4.0]);
    }
    #[test]
    fn test_softmax_sums_to_one() {
        let x = vec![1.0, 2.0, 3.0];
        let s = Activation::softmax(&x);
        let sum: f64 = s.iter().sum();
        assert!(approx_eq(sum, 1.0, 1e-10));
    }
    #[test]
    fn test_sigmoid_symmetry() {
        // sigmoid(z) + sigmoid(-z) == 1 by definition.
        assert!(approx_eq(sigmoid(1.0) + sigmoid(-1.0), 1.0, 1e-10));
    }
    #[test]
    fn test_dense_layer_forward_shape() {
        let mut layer = DenseLayer::new(4, 3);
        let x = vec![1.0, 0.0, -1.0, 0.5];
        let out = layer.forward(&x);
        assert_eq!(out.len(), 3);
    }
    #[test]
    fn test_dense_layer_backward_shape() {
        let mut layer = DenseLayer::new(3, 2);
        let x = vec![1.0, 2.0, 3.0];
        let _ = layer.forward(&x);
        let grad_out = vec![0.1, -0.2];
        let grad_in = layer.backward(&grad_out, 0.01);
        assert_eq!(grad_in.len(), 3);
    }
    #[test]
    fn test_batch_norm_output_shape() {
        let mut bn = BatchNorm::new(3);
        let batch = vec![vec![1.0, 2.0, 3.0], vec![4.0, 5.0, 6.0]];
        let out = bn.forward_batch(&batch);
        assert_eq!(out.len(), 2);
        assert_eq!(out[0].len(), 3);
    }
    #[test]
    fn test_adam_step_decreases_params() {
        // Positive gradient must move the parameter downward.
        let mut params = vec![1.0, 1.0];
        let grads = vec![0.5, 0.5];
        let mut opt = Adam::new(2, 0.01, 0.9, 0.999, 1e-8);
        opt.step(&mut params, &grads);
        assert!(params[0] < 1.0);
    }
    #[test]
    fn test_rmsprop_step() {
        let mut params = vec![1.0];
        let grads = vec![1.0];
        let mut opt = RmsProp::new(1, 0.01, 0.9, 1e-8);
        opt.step(&mut params, &grads);
        assert!(params[0] < 1.0);
    }
    #[test]
    fn test_decision_tree_xor() {
        let x = vec![
            vec![0.0, 0.0],
            vec![0.5, 0.0],
            vec![0.0, 0.5],
            vec![1.0, 0.0],
            vec![1.0, 1.0],
            vec![0.5, 1.0],
        ];
        let y = vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
        let mut tree = DecisionTree::new(4, 1);
        tree.fit(&x, &y);
        let acc = tree.accuracy(&x, &y);
        assert!(acc >= 0.75);
    }
    #[test]
    fn test_decision_tree_linearly_separable() {
        let x: Vec<Vec<f64>> = (0..20).map(|i| vec![i as f64]).collect();
        let y: Vec<f64> = (0..20).map(|i| if i < 10 { 0.0 } else { 1.0 }).collect();
        let mut tree = DecisionTree::new(3, 1);
        tree.fit(&x, &y);
        let acc = tree.accuracy(&x, &y);
        assert!(acc >= 0.9);
    }
    #[test]
    fn test_random_forest_accuracy() {
        let x: Vec<Vec<f64>> = (0..20).map(|i| vec![i as f64, (i % 2) as f64]).collect();
        let y: Vec<f64> = (0..20).map(|i| if i < 10 { 0.0 } else { 1.0 }).collect();
        let mut rf = RandomForest::new(5, 3, 1);
        rf.fit(&x, &y);
        let acc = rf.accuracy(&x, &y);
        assert!(acc >= 0.8);
    }
    #[test]
    fn test_confusion_matrix() {
        let y_true = vec![0, 1, 1, 0, 1];
        let y_pred = vec![0, 1, 0, 0, 1];
        let cm = confusion_matrix(&y_true, &y_pred, 2);
        assert_eq!(cm[1][1], 2);
        assert_eq!(cm[1][0], 1);
    }
    #[test]
    fn test_precision_recall_f1() {
        let cm = vec![vec![2usize, 0], vec![0, 3]];
        let (p, r, f1) = precision_recall_f1(&cm);
        assert!(approx_eq(p, 1.0, 1e-10));
        assert!(approx_eq(r, 1.0, 1e-10));
        assert!(approx_eq(f1, 1.0, 1e-10));
    }
    #[test]
    fn test_accuracy_from_cm() {
        let cm = vec![vec![3usize, 1], vec![2, 4]];
        let acc = accuracy_from_cm(&cm);
        assert!(approx_eq(acc, 7.0 / 10.0, 1e-10));
    }
    #[test]
    fn test_mae() {
        let y = vec![1.0, 2.0, 3.0];
        let p = vec![1.0, 2.0, 4.0];
        assert!(approx_eq(mae(&y, &p), 1.0 / 3.0, 1e-10));
    }
    #[test]
    fn test_rmse() {
        let y = vec![0.0, 0.0];
        let p = vec![1.0, 1.0];
        assert!(approx_eq(rmse(&y, &p), 1.0, 1e-10));
    }
    #[test]
    fn test_r2_perfect() {
        let y = vec![1.0, 2.0, 3.0, 4.0];
        assert!(approx_eq(r2(&y, &y), 1.0, 1e-10));
    }
    #[test]
    fn test_min_max_scaler() {
        let data = vec![vec![0.0, 10.0], vec![2.0, 20.0], vec![4.0, 30.0]];
        let scaler = MinMaxScaler::fit(&data);
        let t = scaler.transform(&data);
        assert!(approx_eq(t[0][0], 0.0, 1e-10));
        assert!(approx_eq(t[2][0], 1.0, 1e-10));
        let inv = scaler.inverse_transform(&t);
        assert!(approx_eq(inv[1][1], 20.0, 1e-8));
    }
    #[test]
    fn test_standard_scaler() {
        // Standardised data must have (near-)zero mean.
        let data = vec![vec![2.0], vec![4.0], vec![6.0]];
        let scaler = StandardScaler::fit(&data);
        let t = scaler.transform(&data);
        let mean: f64 = t.iter().map(|r| r[0]).sum::<f64>() / 3.0;
        assert!(mean.abs() < 1e-8);
    }
    #[test]
    fn test_gnb_fit_predict() {
        let x = vec![vec![1.0], vec![2.0], vec![10.0], vec![11.0]];
        let y = vec![0, 0, 1, 1];
        let gnb = GaussianNaiveBayes::fit(&x, &y, 2);
        assert_eq!(gnb.predict_one(&[1.5]), 0);
        assert_eq!(gnb.predict_one(&[10.5]), 1);
    }
    #[test]
    fn test_knn_predict() {
        let x_train = vec![vec![0.0], vec![1.0], vec![10.0], vec![11.0]];
        let y_train = vec![0.0, 0.0, 1.0, 1.0];
        let mut knn = KNearestNeighbours::new(2);
        knn.fit(x_train, y_train);
        assert!(approx_eq(knn.predict_one(&[0.5]), 0.0, 0.5));
        assert!(approx_eq(knn.predict_one(&[10.5]), 1.0, 0.5));
    }
    #[test]
    fn test_cross_val_linear() {
        let x: Vec<Vec<f64>> = (0..20).map(|i| vec![i as f64]).collect();
        let y: Vec<f64> = (0..20).map(|i| 2.0 * i as f64).collect();
        let result = cross_val_linear_regression(&x, &y, 4);
        assert!(result.mean_score > 0.9);
    }
    #[test]
    fn test_solve_linear_system() {
        // 2x + y = 5, x + 3y = 10  =>  x = 1, y = 3
        let a = Matrix::from_vec(2, 2, vec![2.0, 1.0, 1.0, 3.0]);
        let b = vec![5.0, 10.0];
        let x = solve_linear_system(&a, &b).unwrap();
        assert!(approx_eq(x[0], 1.0, 1e-8));
        assert!(approx_eq(x[1], 3.0, 1e-8));
    }
    #[test]
    fn test_roc_auc_perfect() {
        let scores = vec![0.9, 0.8, 0.2, 0.1];
        let labels = vec![1, 1, 0, 0];
        let auc = roc_auc(&scores, &labels);
        assert!(approx_eq(auc, 1.0, 1e-8));
    }
    #[test]
    fn test_roc_auc_random() {
        let scores = vec![0.5, 0.5, 0.5, 0.5];
        let labels = vec![1, 0, 1, 0];
        let auc = roc_auc(&scores, &labels);
        assert!((0.0..=1.0).contains(&auc));
    }
}
/// Trains `n_trees` decision trees on LCG-seeded bootstrap samples and
/// computes out-of-bag (OOB) predictions and accuracy.
///
/// Each tree votes only on the rows absent from its own bootstrap sample;
/// a row that appears in every bootstrap gets a default prediction of
/// 0.0. Seeding is a fixed function of the tree index, so the whole
/// procedure is deterministic.
pub fn random_forest_oob(x: &[Vec<f64>], y: &[f64], n_trees: usize, max_depth: usize) -> OobResult {
    let n = x.len();
    let mut all_votes: Vec<Vec<f64>> = vec![Vec::new(); n];
    let mut bootstrap_sets = Vec::with_capacity(n_trees);
    for t in 0..n_trees {
        // Per-tree LCG stream (same multiplier/increment as PCG/Knuth).
        let mut seed = (t as u64 + 17).wrapping_mul(6364136223846793005);
        let mut indices = Vec::with_capacity(n);
        for _ in 0..n {
            seed = seed
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            // Use the high bits of the state for the draw.
            indices.push((seed >> 33) as usize % n);
        }
        let in_bag: std::collections::HashSet<usize> = indices.iter().cloned().collect();
        let bx: Vec<Vec<f64>> = indices.iter().map(|&i| x[i].clone()).collect();
        let by: Vec<f64> = indices.iter().map(|&i| y[i]).collect();
        let mut tree = DecisionTree::new(max_depth, 1);
        tree.fit(&bx, &by);
        // Only out-of-bag rows receive this tree's vote.
        for i in 0..n {
            if !in_bag.contains(&i) {
                all_votes[i].push(tree.predict_one(&x[i]));
            }
        }
        bootstrap_sets.push(indices);
    }
    let oob_predictions: Vec<f64> = all_votes
        .iter()
        .map(|votes| {
            if votes.is_empty() {
                0.0
            } else {
                majority_vote(votes)
            }
        })
        .collect();
    // A prediction within 0.5 of the truth counts as correct (labels are
    // integer-valued class codes stored as f64).
    let correct = oob_predictions
        .iter()
        .zip(y.iter())
        .filter(|(p, t)| (*p - *t).abs() < 0.5)
        .count();
    let oob_accuracy = correct as f64 / n as f64;
    OobResult {
        oob_predictions,
        oob_accuracy,
        bootstrap_sets,
    }
}
/// Deterministically samples `k` distinct feature indices from `0..d`
/// using a partial Fisher–Yates shuffle driven by an LCG seeded from
/// `seed`. Returns `min(k, d)` indices.
pub fn feature_bagging(d: usize, k: usize, seed: u64) -> Vec<usize> {
    let take = k.min(d);
    let mut state = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
    let mut pool: Vec<usize> = (0..d).collect();
    for pos in 0..take {
        state = state
            .wrapping_mul(6364136223846793005)
            .wrapping_add(1442695040888963407);
        // High bits of the LCG state pick from the not-yet-chosen tail.
        let pick = pos + (state >> 33) as usize % (d - pos);
        pool.swap(pos, pick);
    }
    pool.truncate(take);
    pool
}
/// Squared-exponential (RBF) kernel:
/// `amplitude^2 * exp(-||x1 - x2||^2 / (2 * length_scale^2))`.
pub fn rbf_kernel(x1: &[f64], x2: &[f64], amplitude: f64, length_scale: f64) -> f64 {
    let mut sq_dist = 0.0;
    for (a, b) in x1.iter().zip(x2.iter()) {
        let delta = a - b;
        sq_dist += delta * delta;
    }
    let exponent = -sq_dist / (2.0 * length_scale * length_scale);
    amplitude * amplitude * exponent.exp()
}
951pub fn kernel_matrix(x: &[Vec<f64>], amplitude: f64, length_scale: f64) -> Matrix {
953 let n = x.len();
954 let mut km = Matrix::zeros(n, n);
955 for i in 0..n {
956 for j in 0..n {
957 km.set(i, j, rbf_kernel(&x[i], &x[j], amplitude, length_scale));
958 }
959 }
960 km
961}
/// Cholesky decomposition: returns a lower-triangular `L` with
/// `L * L^T ≈ a` for a symmetric positive-definite `a`.
///
/// Diagonal terms are clamped to at least 1e-300 before the square root
/// (and as divisors), so a nearly-singular input degrades numerically
/// instead of producing NaN — it does not error on non-PD matrices.
pub fn cholesky(a: &Matrix) -> Matrix {
    let n = a.rows;
    let mut l = Matrix::zeros(n, n);
    for i in 0..n {
        for j in 0..=i {
            // Standard Cholesky–Crout update: a[i][j] minus the dot of the
            // already-computed parts of rows i and j.
            let mut sum = a.get(i, j);
            for k in 0..j {
                sum -= l.get(i, k) * l.get(j, k);
            }
            if i == j {
                l.set(i, j, (sum.max(1e-300)).sqrt());
            } else {
                let ljj = l.get(j, j).max(1e-300);
                l.set(i, j, sum / ljj);
            }
        }
    }
    l
}
984pub fn forward_substitution(l: &Matrix, b: &[f64]) -> Vec<f64> {
986 let n = l.rows;
987 let mut x = vec![0.0f64; n];
988 for i in 0..n {
989 let mut s = b[i];
990 for j in 0..i {
991 s -= l.get(i, j) * x[j];
992 }
993 x[i] = s / l.get(i, i).max(1e-300);
994 }
995 x
996}
997pub fn backward_substitution_t(l: &Matrix, b: &[f64]) -> Vec<f64> {
999 let n = l.rows;
1000 let mut x = vec![0.0f64; n];
1001 for i in (0..n).rev() {
1002 let mut s = b[i];
1003 for j in i + 1..n {
1004 s -= l.get(j, i) * x[j];
1005 }
1006 x[i] = s / l.get(i, i).max(1e-300);
1007 }
1008 x
1009}
/// Average unsuccessful-search path length `c(n)` used to normalise
/// isolation-forest depths: `2 * H(n-1) - 2(n-1)/n`, with the harmonic
/// number approximated as `ln(n-1) + gamma` (Euler–Mascheroni constant).
/// Returns 0.0 for `n <= 1`.
pub fn average_path_length(n: usize) -> f64 {
    if n <= 1 {
        return 0.0;
    }
    let size = n as f64;
    let harmonic = (size - 1.0).ln() + 0.5772156649;
    2.0 * harmonic - 2.0 * (size - 1.0) / size
}
/// Recursively builds one isolation tree: at each node a feature and a
/// split threshold are drawn from the LCG state in `seed` (mutated in
/// place), and the rows are partitioned on it.
///
/// Leaves are produced when a node has <= 1 row, `max_depth` is reached,
/// or the chosen feature is constant across the node's rows.
pub fn build_isolation_tree(
    x: &[Vec<f64>],
    max_depth: usize,
    depth: usize,
    seed: &mut u64,
) -> IsolationNode {
    let n = x.len();
    if n <= 1 || depth >= max_depth {
        return IsolationNode::Leaf { size: n };
    }
    let d = x[0].len();
    // Advance the LCG and pick a feature from the high bits.
    *seed = seed
        .wrapping_mul(6364136223846793005)
        .wrapping_add(1442695040888963407);
    let feat = (*seed >> 33) as usize % d;
    // Range of the chosen feature over this node's rows.
    let mut min_v = x[0][feat];
    let mut max_v = min_v;
    for row in x.iter() {
        if row[feat] < min_v {
            min_v = row[feat];
        }
        if row[feat] > max_v {
            max_v = row[feat];
        }
    }
    if (max_v - min_v).abs() < 1e-15 {
        return IsolationNode::Leaf { size: n };
    }
    // Second LCG draw: a uniform fraction in [0, 1) from the top 53 bits,
    // mapped into (min_v, max_v).
    *seed = seed
        .wrapping_mul(6364136223846793005)
        .wrapping_add(1442695040888963407);
    let frac = (*seed >> 11) as f64 / (1u64 << 53) as f64;
    let threshold = min_v + frac * (max_v - min_v);
    let (left_x, right_x): (Vec<Vec<f64>>, Vec<Vec<f64>>) =
        x.iter().cloned().partition(|row| row[feat] <= threshold);
    IsolationNode::Internal {
        feature: feat,
        threshold,
        left: Box::new(build_isolation_tree(&left_x, max_depth, depth + 1, seed)),
        right: Box::new(build_isolation_tree(&right_x, max_depth, depth + 1, seed)),
    }
}
/// LOF reachability distance: the Euclidean distance from `p` to `o`,
/// floored at `o`'s k-distance `k_dist_o`.
pub fn reach_dist(p: &[f64], o: &[f64], k_dist_o: f64) -> f64 {
    let mut sq = 0.0;
    for (a, b) in p.iter().zip(o.iter()) {
        let delta = a - b;
        sq += delta * delta;
    }
    sq.sqrt().max(k_dist_o)
}
/// Local Outlier Factor for every row of `x` with neighbourhood size `k`.
///
/// Scores near 1.0 indicate inlier density; values well above 1.0 mark
/// points whose local density is lower than that of their neighbours.
/// Returns an empty vector for empty input; a point with no neighbours
/// (n == 1) gets a score of 1.0.
pub fn lof_scores(x: &[Vec<f64>], k: usize) -> Vec<f64> {
    let n = x.len();
    if n == 0 {
        return Vec::new();
    }
    // Plain Euclidean distance between two rows.
    let dist = |a: &[f64], b: &[f64]| -> f64 {
        a.iter()
            .zip(b.iter())
            .map(|(u, v)| (u - v).powi(2))
            .sum::<f64>()
            .sqrt()
    };
    // Pass 1: each point's k-distance and its k nearest neighbours.
    let mut k_dists = vec![0.0f64; n];
    let mut neighbours: Vec<Vec<usize>> = vec![Vec::new(); n];
    for i in 0..n {
        let mut dists: Vec<(f64, usize)> = (0..n)
            .filter(|&j| j != i)
            .map(|j| (dist(&x[i], &x[j]), j))
            .collect();
        dists.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));
        let kk = k.min(dists.len());
        // k-distance = distance to the kk-th nearest neighbour (0.0 when
        // there are no neighbours at all).
        k_dists[i] = dists.get(kk.saturating_sub(1)).map(|d| d.0).unwrap_or(0.0);
        neighbours[i] = dists[..kk].iter().map(|d| d.1).collect();
    }
    // Pass 2: local reachability density = |N| / sum of reach-distances.
    let mut lrd = vec![0.0f64; n];
    for i in 0..n {
        let nb = &neighbours[i];
        if nb.is_empty() {
            lrd[i] = 1.0;
            continue;
        }
        let sum_reach: f64 = nb
            .iter()
            .map(|&j| reach_dist(&x[i], &x[j], k_dists[j]))
            .sum();
        // Clamp so duplicate points (zero reach-distance) stay finite.
        lrd[i] = nb.len() as f64 / sum_reach.max(1e-300);
    }
    // Pass 3: LOF = mean neighbour LRD divided by the point's own LRD.
    (0..n)
        .map(|i| {
            let nb = &neighbours[i];
            if nb.is_empty() {
                return 1.0;
            }
            let avg_lrd: f64 = nb.iter().map(|&j| lrd[j]).sum::<f64>() / nb.len() as f64;
            avg_lrd / lrd[i].max(1e-300)
        })
        .collect()
}
/// t-SNE input affinities: for each row a Gaussian-kernel precision
/// `beta` is tuned by bisection (up to 50 steps) so the conditional
/// distribution's entropy matches `ln(perplexity)`; the conditional
/// probabilities are then symmetrised and normalised.
///
/// NOTE(review): symmetrisation divides by `n * n`, not by `2n` as in the
/// original t-SNE paper — the resulting matrix therefore does not sum to
/// 1. Confirm this is intended by the `tsne` caller before changing it.
pub fn compute_pairwise_affinities(x: &[Vec<f64>], perplexity: f64) -> Vec<Vec<f64>> {
    let n = x.len();
    let target_entropy = perplexity.ln();
    let mut p_matrix = vec![vec![0.0f64; n]; n];
    for i in 0..n {
        // Bisection bounds on the precision beta for row i.
        let mut beta_lo = 0.0f64;
        let mut beta_hi = f64::INFINITY;
        let mut beta = 1.0f64;
        // Squared distances from row i to every other row (0 to itself).
        let dists: Vec<f64> = (0..n)
            .map(|j| {
                if j == i {
                    0.0
                } else {
                    x[i].iter()
                        .zip(x[j].iter())
                        .map(|(a, b)| (a - b).powi(2))
                        .sum()
                }
            })
            .collect();
        for _ in 0..50 {
            // Gaussian kernel with the current beta; self-affinity is 0.
            let exps: Vec<f64> = dists
                .iter()
                .enumerate()
                .map(|(j, &d)| if j == i { 0.0 } else { (-beta * d).exp() })
                .collect();
            let sum_exps: f64 = exps.iter().sum::<f64>().max(1e-300);
            let probs: Vec<f64> = exps.iter().map(|&e| e / sum_exps).collect();
            // Shannon entropy of the conditional distribution, expressed
            // as -sum p ln p = sum (-p ln p + beta p d) for this kernel.
            let h: f64 = probs
                .iter()
                .zip(dists.iter())
                .filter(|(p, _)| **p > 1e-300)
                .map(|(p, d)| -p * (p.ln()) + beta * p * d)
                .sum();
            if (h - target_entropy).abs() < 1e-5 {
                p_matrix[i][..n].copy_from_slice(&probs[..n]);
                break;
            }
            // Entropy too high => distribution too flat => raise beta;
            // double while no upper bound exists, else bisect.
            if h > target_entropy {
                beta_lo = beta;
                beta = if beta_hi.is_infinite() {
                    beta * 2.0
                } else {
                    (beta + beta_hi) / 2.0
                };
            } else {
                beta_hi = beta;
                beta = (beta + beta_lo) / 2.0;
            }
        }
        // Recompute once more with the final beta in case the loop ended
        // without hitting the tolerance (row not yet written).
        let exps: Vec<f64> = dists
            .iter()
            .enumerate()
            .map(|(j, &d)| if j == i { 0.0 } else { (-beta * d).exp() })
            .collect();
        let sum_exps: f64 = exps.iter().sum::<f64>().max(1e-300);
        for j in 0..n {
            p_matrix[i][j] = exps[j] / sum_exps;
        }
    }
    // Symmetrise: p_sym[i][j] = (p(j|i) + p(i|j)) / n^2.
    let total_n = (n * n) as f64;
    let mut p_sym = vec![vec![0.0f64; n]; n];
    for i in 0..n {
        for j in 0..n {
            p_sym[i][j] = (p_matrix[i][j] + p_matrix[j][i]) / total_n;
        }
    }
    p_sym
}
1198pub fn tsne(x: &[Vec<f64>], perplexity: f64, n_iter: usize, lr: f64) -> TsneResult {
1208 let n = x.len();
1209 if n == 0 {
1210 return TsneResult {
1211 embedding: Vec::new(),
1212 kl_divergence: 0.0,
1213 };
1214 }
1215 let p = compute_pairwise_affinities(x, perplexity);
1216 let mut y: Vec<[f64; 2]> = (0..n)
1217 .map(|i| {
1218 let v = (i as f64 * 1.3).sin() * 0.01;
1219 let w = (i as f64 * 2.7).cos() * 0.01;
1220 [v, w]
1221 })
1222 .collect();
1223 let mut gains = vec![[1.0f64; 2]; n];
1224 let mut velocity = vec![[0.0f64; 2]; n];
1225 let momentum = 0.8;
1226 for _iter in 0..n_iter {
1227 let mut q_num = vec![vec![0.0f64; n]; n];
1228 let mut z = 0.0f64;
1229 for i in 0..n {
1230 for j in (i + 1)..n {
1231 let diff = [y[i][0] - y[j][0], y[i][1] - y[j][1]];
1232 let qval = 1.0 / (1.0 + diff[0] * diff[0] + diff[1] * diff[1]);
1233 q_num[i][j] = qval;
1234 q_num[j][i] = qval;
1235 z += 2.0 * qval;
1236 }
1237 }
1238 let z = z.max(1e-300);
1239 let mut grad = vec![[0.0f64; 2]; n];
1240 for i in 0..n {
1241 for j in 0..n {
1242 if i == j {
1243 continue;
1244 }
1245 let q = q_num[i][j] / z;
1246 let factor = 4.0 * (p[i][j] - q) * q_num[i][j];
1247 let diff = [y[i][0] - y[j][0], y[i][1] - y[j][1]];
1248 grad[i][0] += factor * diff[0];
1249 grad[i][1] += factor * diff[1];
1250 }
1251 }
1252 for i in 0..n {
1253 for d in 0..2 {
1254 gains[i][d] = if (grad[i][d] > 0.0) == (velocity[i][d] > 0.0) {
1255 (gains[i][d] * 0.8).max(0.01)
1256 } else {
1257 gains[i][d] + 0.2
1258 };
1259 velocity[i][d] = momentum * velocity[i][d] - lr * gains[i][d] * grad[i][d];
1260 y[i][d] += velocity[i][d];
1261 }
1262 }
1263 }
1264 let mut q_num = vec![vec![0.0f64; n]; n];
1265 let mut z = 0.0f64;
1266 for i in 0..n {
1267 for j in (i + 1)..n {
1268 let diff = [y[i][0] - y[j][0], y[i][1] - y[j][1]];
1269 let qval = 1.0 / (1.0 + diff[0] * diff[0] + diff[1] * diff[1]);
1270 q_num[i][j] = qval;
1271 q_num[j][i] = qval;
1272 z += 2.0 * qval;
1273 }
1274 }
1275 let z = z.max(1e-300);
1276 let kl: f64 = (0..n)
1277 .flat_map(|i| (0..n).map(move |j| (i, j)))
1278 .filter(|&(i, j)| i != j && p[i][j] > 1e-300)
1279 .map(|(i, j)| {
1280 let q = (q_num[i][j] / z).max(1e-300);
1281 p[i][j] * (p[i][j] / q).ln()
1282 })
1283 .sum();
1284 TsneResult {
1285 embedding: y,
1286 kl_divergence: kl,
1287 }
1288}
/// Per-class precision, recall, and F1 from a square confusion matrix.
///
/// `cm[r][c]` counts samples whose true class is `r` and predicted class is
/// `c`. Returns one `(class, precision, recall, f1)` tuple per class; any
/// metric whose denominator is zero is reported as 0.0.
pub fn multiclass_prf(cm: &[Vec<usize>]) -> Vec<(usize, f64, f64, f64)> {
    let k = cm.len();
    // Row sums are per-class support (tp + fn); column sums are the number
    // of times each class was predicted (tp + fp).
    let mut row_sum = vec![0.0f64; k];
    let mut col_sum = vec![0.0f64; k];
    for r in 0..k {
        for c in 0..k {
            let count = cm[r][c] as f64;
            row_sum[r] += count;
            col_sum[c] += count;
        }
    }
    let mut out = Vec::with_capacity(k);
    for class in 0..k {
        let tp = cm[class][class] as f64;
        let prec = if col_sum[class] > 0.0 {
            tp / col_sum[class]
        } else {
            0.0
        };
        let rec = if row_sum[class] > 0.0 {
            tp / row_sum[class]
        } else {
            0.0
        };
        let f1 = if prec + rec > 0.0 {
            2.0 * prec * rec / (prec + rec)
        } else {
            0.0
        };
        out.push((class, prec, rec, f1));
    }
    out
}
1316pub fn macro_f1(cm: &[Vec<usize>]) -> f64 {
1318 let prf = multiclass_prf(cm);
1319 if prf.is_empty() {
1320 return 0.0;
1321 }
1322 prf.iter().map(|(_, _, _, f1)| f1).sum::<f64>() / prf.len() as f64
1323}
1324pub fn weighted_f1(cm: &[Vec<usize>]) -> f64 {
1326 let prf = multiclass_prf(cm);
1327 let n_classes = cm.len();
1328 let supports: Vec<f64> = (0..n_classes)
1329 .map(|c| cm[c].iter().sum::<usize>() as f64)
1330 .collect();
1331 let total: f64 = supports.iter().sum();
1332 if total == 0.0 {
1333 return 0.0;
1334 }
1335 prf.iter()
1336 .zip(supports.iter())
1337 .map(|((_, _, _, f1), &s)| f1 * s)
1338 .sum::<f64>()
1339 / total
1340}
/// Builds an ROC curve from prediction scores and binary labels (1 = positive).
///
/// Samples are swept in order of decreasing score, one curve point per sample
/// (ties are not merged into a single threshold step). Returns `(fpr, tpr)`,
/// each beginning with a leading 0.0.
///
/// BUG FIX: both class counts are now clamped to at least 1 — previously only
/// the negative count was guarded, so an input with zero positive labels
/// produced `0/0 = NaN` TPR values.
pub fn roc_curve(scores: &[f64], labels: &[usize]) -> (Vec<f64>, Vec<f64>) {
    let pos_count = labels.iter().filter(|&&l| l == 1).count() as f64;
    // Negative count is derived from the *raw* positive count, then each
    // denominator is clamped independently (matching the old n_neg guard).
    let neg_count = labels.len() as f64 - pos_count;
    let n_pos = pos_count.max(1.0);
    let n_neg = neg_count.max(1.0);
    // Sort by descending score so each step lowers the decision threshold.
    let mut pairs: Vec<(f64, usize)> = scores.iter().cloned().zip(labels.iter().cloned()).collect();
    pairs.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
    let mut tpr = vec![0.0f64];
    let mut fpr = vec![0.0f64];
    let mut tp = 0.0f64;
    let mut fp = 0.0f64;
    for (_, label) in &pairs {
        if *label == 1 {
            tp += 1.0;
        } else {
            fp += 1.0;
        }
        tpr.push(tp / n_pos);
        fpr.push(fp / n_neg);
    }
    (fpr, tpr)
}
/// Trapezoidal-rule area under the piecewise-linear curve through the points
/// `(x[i], y[i])`. Segment widths use absolute values, so a monotonically
/// traversed curve (e.g. ROC points) integrates correctly in either
/// direction. Only the overlapping prefix of `x` and `y` contributes.
pub fn auc_trapezoid(x: &[f64], y: &[f64]) -> f64 {
    let n = x.len().min(y.len());
    let mut area = 0.0f64;
    for i in 1..n {
        let width = (x[i] - x[i - 1]).abs();
        area += 0.5 * width * (y[i - 1] + y[i]);
    }
    area
}
#[cfg(test)]
// Tests for the expanded ML utilities: regression stumps, gradient boosting,
// random-forest OOB, kNN, Gaussian processes, isolation forests, LOF, t-SNE,
// and the classification / ROC metrics defined in this module.
mod expanded_ml_tests {

    use crate::machine_learning::GaussianProcessRegressor;
    use crate::machine_learning::GradientBoosting;
    use crate::machine_learning::IsolationForest;

    use crate::machine_learning::KnnMetric;
    use crate::machine_learning::KnnModel;

    use crate::machine_learning::RegressionStump;

    use crate::machine_learning::auc_trapezoid;
    use crate::machine_learning::average_path_length;
    use crate::machine_learning::feature_bagging;
    use crate::machine_learning::lof_scores;
    use crate::machine_learning::multiclass_prf;
    use crate::machine_learning::random_forest_oob;
    use crate::machine_learning::rbf_kernel;
    use crate::machine_learning::roc_curve;
    use crate::machine_learning::tsne;
    // Absolute-difference tolerance check used by the float assertions below.
    fn approx(a: f64, b: f64, tol: f64) -> bool {
        (a - b).abs() < tol
    }
    // A stump fitted to a constant target must predict that constant everywhere.
    #[test]
    fn test_stump_constant_target() {
        let x = vec![vec![0.0], vec![1.0], vec![2.0], vec![3.0]];
        let r = vec![1.0, 1.0, 1.0, 1.0];
        let stump = RegressionStump::fit(&x, &r);
        assert!(approx(stump.predict_one(&[0.5]), 1.0, 1e-6));
        assert!(approx(stump.predict_one(&[2.5]), 1.0, 1e-6));
    }
    // Two well-separated clusters: the stump should split between them.
    #[test]
    fn test_stump_split() {
        let x = vec![vec![0.0], vec![1.0], vec![5.0], vec![6.0]];
        let r = vec![0.0, 0.0, 1.0, 1.0];
        let stump = RegressionStump::fit(&x, &r);
        assert!(stump.predict_one(&[0.5]) < 0.5);
        assert!(stump.predict_one(&[5.5]) > 0.5);
    }
    // More boosting rounds should not be substantially worse on the training
    // set than fewer rounds (0.5 slack tolerates stochastic wiggle).
    #[test]
    fn test_gbdt_mse_decreases() {
        let x: Vec<Vec<f64>> = (0..20).map(|i| vec![i as f64]).collect();
        let y: Vec<f64> = (0..20).map(|i| (i as f64) * 0.5).collect();
        let gb1 = GradientBoosting::fit(&x, &y, 5, 0.3);
        let gb2 = GradientBoosting::fit(&x, &y, 50, 0.3);
        assert!(gb2.train_mse(&x, &y) < gb1.train_mse(&x, &y) + 0.5);
    }
    // predict() returns exactly one value per input row.
    #[test]
    fn test_gbdt_predict_len() {
        let x = vec![vec![1.0], vec![2.0], vec![3.0]];
        let y = vec![1.0, 2.0, 3.0];
        let gb = GradientBoosting::fit(&x, &y, 10, 0.1);
        let preds = gb.predict(&x);
        assert_eq!(preds.len(), 3);
    }
    // Requesting 3 of 10 features yields exactly 3.
    #[test]
    fn test_feature_bagging_len() {
        let selected = feature_bagging(10, 3, 42);
        assert_eq!(selected.len(), 3);
    }
    // Requesting more features than exist clamps to the available count.
    #[test]
    fn test_feature_bagging_clamped() {
        let selected = feature_bagging(4, 10, 7);
        assert_eq!(selected.len(), 4);
    }
    // Linearly separable labels: OOB accuracy should beat coin-flipping.
    #[test]
    fn test_oob_accuracy_separable() {
        let x: Vec<Vec<f64>> = (0..30).map(|i| vec![i as f64]).collect();
        let y: Vec<f64> = (0..30).map(|i| if i < 15 { 0.0 } else { 1.0 }).collect();
        let oob = random_forest_oob(&x, &y, 10, 4);
        assert!(oob.oob_accuracy >= 0.5, "oob acc={}", oob.oob_accuracy);
    }
    // One OOB prediction slot per training sample.
    #[test]
    fn test_oob_len() {
        let x = vec![vec![0.0], vec![1.0], vec![2.0], vec![3.0]];
        let y = vec![0.0, 0.0, 1.0, 1.0];
        let oob = random_forest_oob(&x, &y, 4, 3);
        assert_eq!(oob.oob_predictions.len(), 4);
    }
    // Euclidean 3-NN classifies queries near each cluster correctly.
    #[test]
    fn test_knn_model_classify_euclidean() {
        let mut knn = KnnModel::new(3, KnnMetric::Euclidean, false);
        knn.fit(
            vec![
                vec![0.0],
                vec![0.5],
                vec![1.0],
                vec![9.0],
                vec![9.5],
                vec![10.0],
            ],
            vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0],
        );
        assert!(approx(knn.classify(&[0.2]), 0.0, 0.5));
        assert!(approx(knn.classify(&[9.8]), 1.0, 0.5));
    }
    // 2-NN regression at a training point should land near its target.
    #[test]
    fn test_knn_model_regress() {
        let mut knn = KnnModel::new(2, KnnMetric::Euclidean, false);
        knn.fit(vec![vec![0.0], vec![1.0], vec![2.0]], vec![0.0, 1.0, 2.0]);
        let pred = knn.regress(&[1.0]);
        assert!(approx(pred, 1.0, 0.6));
    }
    // Distance-weighted voting: the far outlier (class 1) must not dominate
    // a query sitting inside the class-0 cluster.
    #[test]
    fn test_knn_model_weighted_classify() {
        let mut knn = KnnModel::new(3, KnnMetric::Euclidean, true);
        knn.fit(
            vec![vec![0.0], vec![1.0], vec![2.0], vec![10.0]],
            vec![0.0, 0.0, 0.0, 1.0],
        );
        let pred = knn.classify(&[0.1]);
        assert!(approx(pred, 0.0, 0.5));
    }
    // Manhattan metric, k=1: nearest point decides the class.
    #[test]
    fn test_knn_model_manhattan() {
        let mut knn = KnnModel::new(1, KnnMetric::Manhattan, false);
        knn.fit(vec![vec![0.0, 0.0], vec![3.0, 4.0]], vec![0.0, 1.0]);
        assert!(approx(knn.classify(&[0.1, 0.1]), 0.0, 0.5));
    }
    // k(x, x) with unit parameters should equal 1 (zero distance).
    #[test]
    fn test_rbf_kernel_self() {
        let x = vec![1.0, 2.0, 3.0];
        let k = rbf_kernel(&x, &x, 1.0, 1.0);
        assert!(approx(k, 1.0, 1e-10));
    }
    // RBF similarity must decrease as points move apart.
    #[test]
    fn test_rbf_kernel_decay() {
        let x1 = vec![0.0];
        let x2 = vec![1.0];
        let x3 = vec![5.0];
        let k12 = rbf_kernel(&x1, &x2, 1.0, 1.0);
        let k13 = rbf_kernel(&x1, &x3, 1.0, 1.0);
        assert!(k12 > k13);
    }
    // With tiny noise (1e-3) the GP posterior mean should roughly
    // interpolate the training targets.
    #[test]
    fn test_gp_regression_interpolation() {
        let x_train: Vec<Vec<f64>> = vec![vec![0.0], vec![1.0], vec![2.0]];
        let y_train = vec![0.0, 1.0, 0.0];
        let gp = GaussianProcessRegressor::fit(x_train.clone(), y_train.clone(), 1.0, 1.0, 1e-3);
        let (means, _vars) = gp.predict(&x_train);
        for (m, t) in means.iter().zip(y_train.iter()) {
            assert!(approx(*m, *t, 0.3), "m={m}, t={t}");
        }
    }
    // Predictive variances are variances — they can never be negative.
    #[test]
    fn test_gp_variance_non_negative() {
        let x_train = vec![vec![0.0], vec![1.0]];
        let y_train = vec![0.0, 1.0];
        let gp = GaussianProcessRegressor::fit(x_train, y_train, 1.0, 1.0, 0.1);
        let x_test = vec![vec![0.5], vec![2.0]];
        let (_means, vars) = gp.predict(&x_test);
        for &v in &vars {
            assert!(v >= 0.0, "variance={v}");
        }
    }
    // Sanity check: the log marginal likelihood evaluates to a finite number.
    #[test]
    fn test_gp_log_marginal_likelihood() {
        let x_train = vec![vec![0.0], vec![1.0], vec![2.0]];
        let y_train = vec![0.0, 1.0, 4.0];
        let gp = GaussianProcessRegressor::fit(x_train, y_train, 1.0, 1.0, 0.1);
        let lml = gp.log_marginal_likelihood();
        assert!(lml.is_finite(), "lml={lml}");
    }
    // A point far outside the cluster must score as more anomalous
    // than an in-cluster point.
    #[test]
    fn test_isolation_forest_outlier_score() {
        let inliers: Vec<Vec<f64>> = (0..20).map(|i| vec![i as f64 * 0.1]).collect();
        let outlier = vec![vec![100.0_f64]];
        let mut all_data = inliers.clone();
        all_data.extend(outlier.clone());
        let iso = IsolationForest::fit(&all_data, 20, 10);
        let inlier_score = iso.anomaly_score(&inliers[0]);
        let outlier_score = iso.anomaly_score(&outlier[0]);
        assert!(
            outlier_score > inlier_score,
            "outlier={outlier_score}, inlier={inlier_score}"
        );
    }
    // Isolation-forest anomaly scores live in (0, 1].
    #[test]
    fn test_isolation_forest_score_range() {
        let data: Vec<Vec<f64>> = (0..10).map(|i| vec![i as f64]).collect();
        let iso = IsolationForest::fit(&data, 5, 5);
        let scores = iso.score_samples(&data);
        for &s in &scores {
            assert!(s > 0.0 && s <= 1.0, "score={s}");
        }
    }
    // Expected average BST path length grows with sample count.
    #[test]
    fn test_avg_path_length_grows() {
        assert!(average_path_length(10) > average_path_length(2));
        assert!(average_path_length(100) > average_path_length(10));
    }
    // In a tight uniform cluster, LOF should stay close to 1 for all points.
    #[test]
    fn test_lof_compact_cluster() {
        let data: Vec<Vec<f64>> = (0..10).map(|i| vec![i as f64 * 0.01]).collect();
        let scores = lof_scores(&data, 3);
        for &s in &scores {
            assert!(s < 3.0, "lof={s}");
        }
    }
    // The appended far point must receive a higher LOF than a cluster member.
    #[test]
    fn test_lof_outlier() {
        let mut data: Vec<Vec<f64>> = (0..10).map(|i| vec![i as f64 * 0.1]).collect();
        data.push(vec![100.0]);
        let scores = lof_scores(&data, 3);
        let outlier_score = *scores.last().unwrap();
        let inlier_score = scores[0];
        assert!(
            outlier_score > inlier_score,
            "outlier={outlier_score}, inlier={inlier_score}"
        );
    }
    // t-SNE returns one 2-D embedding point per input row.
    #[test]
    fn test_tsne_embedding_size() {
        let x: Vec<Vec<f64>> = (0..10).map(|i| vec![i as f64, (i % 3) as f64]).collect();
        let result = tsne(&x, 3.0, 20, 100.0);
        assert_eq!(result.embedding.len(), 10);
    }
    // Diagonal confusion matrix: precision, recall, and F1 are all 1.0.
    #[test]
    fn test_multiclass_prf_perfect() {
        let cm = vec![vec![5usize, 0, 0], vec![0, 4, 0], vec![0, 0, 3]];
        let prf = multiclass_prf(&cm);
        for (_, p, r, f1) in &prf {
            assert!(approx(*p, 1.0, 1e-10));
            assert!(approx(*r, 1.0, 1e-10));
            assert!(approx(*f1, 1.0, 1e-10));
        }
    }
    // Perfectly ranked scores give an AUC of exactly 1.
    #[test]
    fn test_roc_auc_trapezoid() {
        let scores = vec![0.9, 0.8, 0.4, 0.2];
        let labels = vec![1usize, 1, 0, 0];
        let (fpr, tpr) = roc_curve(&scores, &labels);
        let auc = auc_trapezoid(&fpr, &tpr);
        assert!(approx(auc, 1.0, 1e-6), "auc={auc}");
    }
}