ghostflow_ml/bayesian.rs

//! Bayesian Linear Models - Bayesian Ridge, ARD Regression

use ghostflow_core::Tensor;

/// Bayesian Ridge regression: linear regression with a Gaussian prior on the
/// weights, where the noise precision `alpha` and weight precision `lambda`
/// are estimated from the data by evidence maximization. (Per-feature
/// relevance determination lives in the separate `ARDRegression` below.)
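///
/// A minimal usage sketch (assuming the crate layout implied by this file,
/// i.e. that `BayesianRidge` is re-exported from `ghostflow_ml`, and the
/// `Tensor` API exercised by the tests at the bottom of this file):
///
/// ```ignore
/// use ghostflow_core::Tensor;
/// use ghostflow_ml::BayesianRidge;
///
/// let x = Tensor::from_slice(&[1.0, 2.0, 3.0, 4.0], &[4, 1]).unwrap();
/// let y = Tensor::from_slice(&[2.0, 4.0, 6.0, 8.0], &[4]).unwrap();
///
/// let mut model = BayesianRidge::new().n_iter(100);
/// model.fit(&x, &y);
///
/// // Point predictions plus per-sample predictive standard deviations.
/// let (mean, std) = model.predict_with_std(&x);
/// ```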
pub struct BayesianRidge {
    pub n_iter: usize,
    pub tol: f32,
    /// Gamma-prior shape/rate hyperparameters for the noise precision `alpha`.
    pub alpha_1: f32,
    pub alpha_2: f32,
    /// Gamma-prior shape/rate hyperparameters for the weight precision `lambda`.
    pub lambda_1: f32,
    pub lambda_2: f32,
    pub fit_intercept: bool,
    coef_: Option<Vec<f32>>,
    intercept_: Option<f32>,
    alpha_: Option<f32>,
    lambda_: Option<f32>,
    sigma_: Option<Vec<f32>>,
    n_iter_: usize,
}

impl BayesianRidge {
    pub fn new() -> Self {
        BayesianRidge {
            n_iter: 300,
            tol: 1e-3,
            alpha_1: 1e-6,
            alpha_2: 1e-6,
            lambda_1: 1e-6,
            lambda_2: 1e-6,
            fit_intercept: true,
            coef_: None,
            intercept_: None,
            alpha_: None,
            lambda_: None,
            sigma_: None,
            n_iter_: 0,
        }
    }

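    /// Sets the maximum number of evidence-maximization iterations (builder style).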
    pub fn n_iter(mut self, n: usize) -> Self {
        self.n_iter = n;
        self
    }

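    /// Fits the model by evidence maximization: alternately solves the
    /// penalized least-squares system for the posterior mean and covariance,
    /// then re-estimates the noise precision `alpha` and weight precision
    /// `lambda`, stopping once both change by less than `tol`.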
    pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
        let x_data = x.data_f32();
        let y_data = y.data_f32();
        let n_samples = x.dims()[0];
        let n_features = x.dims()[1];

        // Center data if fitting intercept
        let (x_centered, y_centered, x_mean, y_mean) = if self.fit_intercept {
            let x_mean: Vec<f32> = (0..n_features)
                .map(|j| (0..n_samples).map(|i| x_data[i * n_features + j]).sum::<f32>() / n_samples as f32)
                .collect();
            let y_mean = y_data.iter().sum::<f32>() / n_samples as f32;

            let x_centered: Vec<f32> = (0..n_samples)
                .flat_map(|i| (0..n_features).map(|j| x_data[i * n_features + j] - x_mean[j]).collect::<Vec<_>>())
                .collect();
            let y_centered: Vec<f32> = y_data.iter().map(|&y| y - y_mean).collect();

            (x_centered, y_centered, x_mean, y_mean)
        } else {
            (x_data.clone(), y_data.clone(), vec![0.0; n_features], 0.0)
        };

        // Initialize hyperparameters
        let mut alpha = 1.0f32;  // Noise precision
        let mut lambda = 1.0f32; // Weight precision

        // Compute X^T X
        let mut xtx = vec![0.0f32; n_features * n_features];
        for i in 0..n_features {
            for j in 0..n_features {
                for k in 0..n_samples {
                    xtx[i * n_features + j] += x_centered[k * n_features + i] * x_centered[k * n_features + j];
                }
            }
        }

        // Compute X^T y
        let mut xty = vec![0.0f32; n_features];
        for i in 0..n_features {
            for k in 0..n_samples {
                xty[i] += x_centered[k * n_features + i] * y_centered[k];
            }
        }

        let mut coef = vec![0.0f32; n_features];
        let mut sigma = vec![0.0f32; n_features * n_features];

        for iter in 0..self.n_iter {
            let alpha_old = alpha;
            let lambda_old = lambda;

            // Posterior covariance is Sigma = (alpha * X^T X + lambda * I)^-1.
            // We invert the scaled system A = X^T X + (lambda / alpha) * I, so
            // `sigma` below actually holds alpha * Sigma; the factor of alpha
            // cancels in the posterior mean and is divided out before `sigma`
            // is stored at the end of fit.
            let mut a = xtx.clone();
            for i in 0..n_features {
                a[i * n_features + i] += lambda / alpha;
            }

            // Invert via Cholesky decomposition
            sigma = self.invert_matrix(&a, n_features);

            // Posterior mean: m = alpha * Sigma * X^T y = A^-1 * X^T y
            for i in 0..n_features {
                coef[i] = 0.0;
                for j in 0..n_features {
                    coef[i] += sigma[i * n_features + j] * xty[j];
                }
            }

            // Residual sum of squares at the current posterior mean
            let mut residual_sum = 0.0f32;
            for i in 0..n_samples {
                let mut pred = 0.0f32;
                for j in 0..n_features {
                    pred += x_centered[i * n_features + j] * coef[j];
                }
                residual_sum += (y_centered[i] - pred).powi(2);
            }

            // Effective number of well-determined parameters:
            // gamma = trace(alpha * X^T X * Sigma) = trace(X^T X * A^-1).
            // `sigma` already carries the factor of alpha, so no extra
            // multiplication by alpha is needed here.
            let mut gamma = 0.0f32;
            for i in 0..n_features {
                for j in 0..n_features {
                    gamma += xtx[i * n_features + j] * sigma[j * n_features + i];
                }
            }
            gamma = gamma.clamp(0.0, n_features as f32);

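            // Fixed-point hyperparameter updates (MacKay-style, with the
            // Gamma(shape, rate) hyperpriors parameterized above):
            //   alpha  <- (N - gamma + 2 * alpha_1) / (RSS     + 2 * alpha_2)
            //   lambda <- (gamma + 2 * lambda_1)    / (||w||^2 + 2 * lambda_2)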
            alpha = (n_samples as f32 - gamma + 2.0 * self.alpha_1) /
                    (residual_sum + 2.0 * self.alpha_2);

            let coef_sq_sum: f32 = coef.iter().map(|&c| c * c).sum();
            lambda = (gamma + 2.0 * self.lambda_1) / (coef_sq_sum + 2.0 * self.lambda_2);

            self.n_iter_ = iter + 1;

            // Check convergence
            if (alpha - alpha_old).abs() < self.tol && (lambda - lambda_old).abs() < self.tol {
                break;
            }
        }

        // Compute intercept
        let intercept = if self.fit_intercept {
            let mut int = y_mean;
            for j in 0..n_features {
                int -= coef[j] * x_mean[j];
            }
            int
        } else {
            0.0
        };

        // The loop's `sigma` holds alpha * Sigma; divide the factor out so the
        // stored value is the true posterior covariance used by predict_with_std.
        for v in sigma.iter_mut() {
            *v /= alpha;
        }

        self.coef_ = Some(coef);
        self.intercept_ = Some(intercept);
        self.alpha_ = Some(alpha);
        self.lambda_ = Some(lambda);
        self.sigma_ = Some(sigma);
    }

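    /// Inverts a symmetric positive-definite matrix via Cholesky factorization:
    /// A = L L^T, then A^-1 = L^-T L^-1. The 1e-10 clamps guard against
    /// numerically singular inputs.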
    fn invert_matrix(&self, a: &[f32], n: usize) -> Vec<f32> {
        let mut l = vec![0.0f32; n * n];

        // Cholesky decomposition
        for i in 0..n {
            for j in 0..=i {
                let mut sum = 0.0f32;
                if i == j {
                    for k in 0..j {
                        sum += l[j * n + k] * l[j * n + k];
                    }
                    l[j * n + j] = (a[j * n + j] - sum).max(1e-10).sqrt();
                } else {
                    for k in 0..j {
                        sum += l[i * n + k] * l[j * n + k];
                    }
                    l[i * n + j] = (a[i * n + j] - sum) / l[j * n + j].max(1e-10);
                }
            }
        }

        // Invert L
        let mut l_inv = vec![0.0f32; n * n];
        for i in 0..n {
            l_inv[i * n + i] = 1.0 / l[i * n + i].max(1e-10);
            for j in (i + 1)..n {
                let mut sum = 0.0f32;
                for k in i..j {
                    sum += l[j * n + k] * l_inv[k * n + i];
                }
                l_inv[j * n + i] = -sum / l[j * n + j].max(1e-10);
            }
        }

        // A^-1 = L_inv^T * L_inv
        let mut inv = vec![0.0f32; n * n];
        for i in 0..n {
            for j in 0..n {
                for k in 0..n {
                    inv[i * n + j] += l_inv[k * n + i] * l_inv[k * n + j];
                }
            }
        }

        inv
    }

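    /// Returns the posterior-mean prediction `x . w + intercept` for each row of `x`.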
    pub fn predict(&self, x: &Tensor) -> Tensor {
        let x_data = x.data_f32();
        let n_samples = x.dims()[0];
        let n_features = x.dims()[1];

        let coef = self.coef_.as_ref().expect("Model not fitted");
        let intercept = self.intercept_.unwrap_or(0.0);

        let predictions: Vec<f32> = (0..n_samples)
            .map(|i| {
                let mut pred = intercept;
                for j in 0..n_features {
                    pred += coef[j] * x_data[i * n_features + j];
                }
                pred
            })
            .collect();

        Tensor::from_slice(&predictions, &[n_samples]).unwrap()
    }

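    /// Returns `(mean, std)` per sample. The predictive variance is
    /// `1/alpha + x^T Sigma x`: observation noise plus parameter uncertainty
    /// propagated through the posterior covariance `Sigma`.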
    pub fn predict_with_std(&self, x: &Tensor) -> (Tensor, Tensor) {
        let x_data = x.data_f32();
        let n_samples = x.dims()[0];
        let n_features = x.dims()[1];

        let coef = self.coef_.as_ref().expect("Model not fitted");
        let intercept = self.intercept_.unwrap_or(0.0);
        let sigma = self.sigma_.as_ref().expect("Model not fitted");
        let alpha = self.alpha_.unwrap_or(1.0);

        let mut predictions = Vec::with_capacity(n_samples);
        let mut stds = Vec::with_capacity(n_samples);

        for i in 0..n_samples {
            let xi = &x_data[i * n_features..(i + 1) * n_features];

            let mut pred = intercept;
            for j in 0..n_features {
                pred += coef[j] * xi[j];
            }
            predictions.push(pred);

            // Predictive variance: 1/alpha + x^T Sigma x
            let mut var = 1.0 / alpha;
            for j in 0..n_features {
                for k in 0..n_features {
                    var += xi[j] * sigma[j * n_features + k] * xi[k];
                }
            }
            stds.push(var.sqrt());
        }

        (
            Tensor::from_slice(&predictions, &[n_samples]).unwrap(),
            Tensor::from_slice(&stds, &[n_samples]).unwrap(),
        )
    }

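    /// Coefficient of determination: R^2 = 1 - SS_res / SS_tot on the given data.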
    pub fn score(&self, x: &Tensor, y: &Tensor) -> f32 {
        let predictions = self.predict(x);
        let pred_data = predictions.data_f32();
        let y_data = y.data_f32();

        let y_mean: f32 = y_data.iter().sum::<f32>() / y_data.len() as f32;
        let ss_res: f32 = pred_data.iter().zip(y_data.iter()).map(|(&p, &y)| (y - p).powi(2)).sum();
        let ss_tot: f32 = y_data.iter().map(|&y| (y - y_mean).powi(2)).sum();

        1.0 - ss_res / ss_tot.max(1e-10)
    }
}

impl Default for BayesianRidge {
    fn default() -> Self {
        Self::new()
    }
}

/// Automatic Relevance Determination (ARD) regression: like Bayesian Ridge,
/// but each feature has its own precision `lambda[i]`, so the weights of
/// irrelevant features are driven toward zero as their precisions grow.
pub struct ARDRegression {
    pub n_iter: usize,
    pub tol: f32,
    /// Gamma-prior shape/rate hyperparameters for the noise precision `alpha`.
    pub alpha_1: f32,
    pub alpha_2: f32,
    /// Gamma-prior shape/rate hyperparameters for the per-feature precisions.
    pub lambda_1: f32,
    pub lambda_2: f32,
    /// Cap applied to the per-feature precisions; a feature whose precision
    /// reaches it is very heavily shrunk.
    pub threshold_lambda: f32,
    pub fit_intercept: bool,
    coef_: Option<Vec<f32>>,
    intercept_: Option<f32>,
    alpha_: Option<f32>,
    lambda_: Option<Vec<f32>>,
    sigma_: Option<Vec<f32>>,
    n_iter_: usize,
}

impl ARDRegression {
    pub fn new() -> Self {
        ARDRegression {
            n_iter: 300,
            tol: 1e-3,
            alpha_1: 1e-6,
            alpha_2: 1e-6,
            lambda_1: 1e-6,
            lambda_2: 1e-6,
            threshold_lambda: 1e4,
            fit_intercept: true,
            coef_: None,
            intercept_: None,
            alpha_: None,
            lambda_: None,
            sigma_: None,
            n_iter_: 0,
        }
    }

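    /// Sets the maximum number of evidence-maximization iterations (builder style).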
    pub fn n_iter(mut self, n: usize) -> Self {
        self.n_iter = n;
        self
    }

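    /// Fits the model as in `BayesianRidge::fit`, except that each feature
    /// carries its own precision `lambda[i]`, updated per coordinate and
    /// capped at `threshold_lambda`.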
    pub fn fit(&mut self, x: &Tensor, y: &Tensor) {
        let x_data = x.data_f32();
        let y_data = y.data_f32();
        let n_samples = x.dims()[0];
        let n_features = x.dims()[1];

        // Center data
        let (x_centered, y_centered, x_mean, y_mean) = if self.fit_intercept {
            let x_mean: Vec<f32> = (0..n_features)
                .map(|j| (0..n_samples).map(|i| x_data[i * n_features + j]).sum::<f32>() / n_samples as f32)
                .collect();
            let y_mean = y_data.iter().sum::<f32>() / n_samples as f32;

            let x_centered: Vec<f32> = (0..n_samples)
                .flat_map(|i| (0..n_features).map(|j| x_data[i * n_features + j] - x_mean[j]).collect::<Vec<_>>())
                .collect();
            let y_centered: Vec<f32> = y_data.iter().map(|&y| y - y_mean).collect();

            (x_centered, y_centered, x_mean, y_mean)
        } else {
            (x_data.clone(), y_data.clone(), vec![0.0; n_features], 0.0)
        };

        // Initialize
        let mut alpha = 1.0f32;
        let mut lambda = vec![1.0f32; n_features];  // Per-feature precision

        // Compute X^T X and X^T y
        let mut xtx = vec![0.0f32; n_features * n_features];
        let mut xty = vec![0.0f32; n_features];

        for i in 0..n_features {
            for k in 0..n_samples {
                xty[i] += x_centered[k * n_features + i] * y_centered[k];
            }
            for j in 0..n_features {
                for k in 0..n_samples {
                    xtx[i * n_features + j] += x_centered[k * n_features + i] * x_centered[k * n_features + j];
                }
            }
        }

        let mut coef = vec![0.0f32; n_features];
        let mut sigma = vec![0.0f32; n_features * n_features];

        for iter in 0..self.n_iter {
            let alpha_old = alpha;
            let lambda_old = lambda.clone();

            // Posterior covariance is Sigma = (alpha * X^T X + diag(lambda))^-1.
            // As in BayesianRidge, we invert the scaled system
            // A = X^T X + diag(lambda) / alpha, so `sigma` holds alpha * Sigma.
            let mut a = xtx.clone();
            for i in 0..n_features {
                a[i * n_features + i] += lambda[i] / alpha;
            }

            sigma = self.invert_matrix(&a, n_features);

            // Posterior mean: m = alpha * Sigma * X^T y = A^-1 * X^T y
            for i in 0..n_features {
                coef[i] = 0.0;
                for j in 0..n_features {
                    coef[i] += sigma[i * n_features + j] * xty[j];
                }
            }

            // Residual sum of squares at the current posterior mean
            let mut residual_sum = 0.0f32;
            for i in 0..n_samples {
                let mut pred = 0.0f32;
                for j in 0..n_features {
                    pred += x_centered[i * n_features + j] * coef[j];
                }
                residual_sum += (y_centered[i] - pred).powi(2);
            }

            // Effective number of well-determined parameters:
            // gamma_i = 1 - lambda[i] * Sigma[i][i]; since `sigma` holds
            // alpha * Sigma, its diagonal must be divided by alpha.
            let mut gamma = 0.0f32;
            for i in 0..n_features {
                gamma += 1.0 - lambda[i] * sigma[i * n_features + i] / alpha;
            }
            gamma = gamma.clamp(0.0, n_features as f32);

            alpha = (n_samples as f32 - gamma + 2.0 * self.alpha_1) /
                    (residual_sum + 2.0 * self.alpha_2);

            // Per-feature ARD update:
            //   lambda[i] <- (gamma_i + 2 * lambda_1) / (w_i^2 + 2 * lambda_2),
            // with gamma_i = 1 - lambda[i] * Sigma[i][i] (again dividing the
            // stored alpha * Sigma diagonal by alpha).
            for i in 0..n_features {
                let gamma_i = 1.0 - lambda[i] * sigma[i * n_features + i] / alpha;
                lambda[i] = (gamma_i + 2.0 * self.lambda_1) /
                            (coef[i] * coef[i] + 2.0 * self.lambda_2);

                // Cap very large precisions to keep the solve well-conditioned;
                // a capped feature is heavily shrunk rather than exactly pruned.
                if lambda[i] > self.threshold_lambda {
                    lambda[i] = self.threshold_lambda;
                }
            }

            self.n_iter_ = iter + 1;

            // Check convergence
            let alpha_diff = (alpha - alpha_old).abs();
            let lambda_diff: f32 = lambda.iter().zip(lambda_old.iter())
                .map(|(&l, &lo)| (l - lo).abs())
                .sum::<f32>() / n_features as f32;

            if alpha_diff < self.tol && lambda_diff < self.tol {
                break;
            }
        }

        // Compute intercept
        let intercept = if self.fit_intercept {
            let mut int = y_mean;
            for j in 0..n_features {
                int -= coef[j] * x_mean[j];
            }
            int
        } else {
            0.0
        };

        // As in BayesianRidge, divide out the factor of alpha so the stored
        // value is the true posterior covariance.
        for v in sigma.iter_mut() {
            *v /= alpha;
        }

        self.coef_ = Some(coef);
        self.intercept_ = Some(intercept);
        self.alpha_ = Some(alpha);
        self.lambda_ = Some(lambda);
        self.sigma_ = Some(sigma);
    }

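    /// Same Cholesky-based symmetric positive-definite inverse as
    /// `BayesianRidge::invert_matrix`, duplicated so each struct stays
    /// self-contained.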
    fn invert_matrix(&self, a: &[f32], n: usize) -> Vec<f32> {
        let mut l = vec![0.0f32; n * n];

        // Cholesky decomposition
        for i in 0..n {
            for j in 0..=i {
                let mut sum = 0.0f32;
                if i == j {
                    for k in 0..j {
                        sum += l[j * n + k] * l[j * n + k];
                    }
                    l[j * n + j] = (a[j * n + j] - sum).max(1e-10).sqrt();
                } else {
                    for k in 0..j {
                        sum += l[i * n + k] * l[j * n + k];
                    }
                    l[i * n + j] = (a[i * n + j] - sum) / l[j * n + j].max(1e-10);
                }
            }
        }

        // Invert L
        let mut l_inv = vec![0.0f32; n * n];
        for i in 0..n {
            l_inv[i * n + i] = 1.0 / l[i * n + i].max(1e-10);
            for j in (i + 1)..n {
                let mut sum = 0.0f32;
                for k in i..j {
                    sum += l[j * n + k] * l_inv[k * n + i];
                }
                l_inv[j * n + i] = -sum / l[j * n + j].max(1e-10);
            }
        }

        // A^-1 = L_inv^T * L_inv
        let mut inv = vec![0.0f32; n * n];
        for i in 0..n {
            for j in 0..n {
                for k in 0..n {
                    inv[i * n + j] += l_inv[k * n + i] * l_inv[k * n + j];
                }
            }
        }

        inv
    }

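    /// Returns the posterior-mean prediction `x . w + intercept` for each row of `x`.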
    pub fn predict(&self, x: &Tensor) -> Tensor {
        let x_data = x.data_f32();
        let n_samples = x.dims()[0];
        let n_features = x.dims()[1];

        let coef = self.coef_.as_ref().expect("Model not fitted");
        let intercept = self.intercept_.unwrap_or(0.0);

        let predictions: Vec<f32> = (0..n_samples)
            .map(|i| {
                let mut pred = intercept;
                for j in 0..n_features {
                    pred += coef[j] * x_data[i * n_features + j];
                }
                pred
            })
            .collect();

        Tensor::from_slice(&predictions, &[n_samples]).unwrap()
    }

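    /// Coefficient of determination: R^2 = 1 - SS_res / SS_tot on the given data.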
    pub fn score(&self, x: &Tensor, y: &Tensor) -> f32 {
        let predictions = self.predict(x);
        let pred_data = predictions.data_f32();
        let y_data = y.data_f32();

        let y_mean: f32 = y_data.iter().sum::<f32>() / y_data.len() as f32;
        let ss_res: f32 = pred_data.iter().zip(y_data.iter()).map(|(&p, &y)| (y - p).powi(2)).sum();
        let ss_tot: f32 = y_data.iter().map(|&y| (y - y_mean).powi(2)).sum();

        1.0 - ss_res / ss_tot.max(1e-10)
    }
}

impl Default for ARDRegression {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_bayesian_ridge() {
        let x = Tensor::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0], &[5, 1]).unwrap();
        let y = Tensor::from_slice(&[2.0, 4.0, 6.0, 8.0, 10.0], &[5]).unwrap();

        let mut br = BayesianRidge::new().n_iter(100);
        br.fit(&x, &y);

        let score = br.score(&x, &y);
        assert!(score > 0.9);
    }

    #[test]
    fn test_ard_regression() {
        let x = Tensor::from_slice(&[
            1.0, 0.0,
            2.0, 0.0,
            3.0, 0.0,
            4.0, 0.0,
        ], &[4, 2]).unwrap();
        let y = Tensor::from_slice(&[2.0, 4.0, 6.0, 8.0], &[4]).unwrap();

        let mut ard = ARDRegression::new().n_iter(100);
        ard.fit(&x, &y);

        let predictions = ard.predict(&x);
        assert_eq!(predictions.dims(), &[4]);
    }
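
    // A sketch of a predictive-uncertainty check (it assumes only the Tensor
    // API already used above). Mean predictions come back one per sample, and
    // every predictive standard deviation should be positive and finite, since
    // the variance is at least the observation-noise term 1/alpha.
    #[test]
    fn test_bayesian_ridge_predict_with_std() {
        let x = Tensor::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0], &[5, 1]).unwrap();
        let y = Tensor::from_slice(&[2.0, 4.0, 6.0, 8.0, 10.0], &[5]).unwrap();

        let mut br = BayesianRidge::new().n_iter(100);
        br.fit(&x, &y);

        let (mean, std) = br.predict_with_std(&x);
        assert_eq!(mean.dims(), &[5]);
        assert_eq!(std.dims(), &[5]);
        for &s in std.data_f32().iter() {
            assert!(s.is_finite() && s > 0.0);
        }
    }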
}