Skip to main content

oxilean_std/statistical_learning/
types.rs

1//! Auto-generated module
2//!
3//! 🤖 Generated with [SplitRS](https://github.com/cool-japan/splitrs)
4
5use super::functions::*;
6/// Growth function Π_H(m) for a hypothesis class.
7pub struct GrowthFunction {
8    /// VC dimension of the hypothesis class.
9    pub vc_dim: usize,
10}
11impl GrowthFunction {
12    /// Create a new GrowthFunction.
13    pub fn new(vc_dim: usize) -> Self {
14        Self { vc_dim }
15    }
16    /// Evaluate the Sauer-Shelah upper bound for Π_H(m).
17    pub fn evaluate(&self, m: usize) -> usize {
18        VCDimension::new(self.vc_dim).sauer_shelah_bound(m)
19    }
20}
21/// Cross-validation scheme.
22#[allow(dead_code)]
23#[derive(Debug, Clone)]
24pub struct CrossValidation {
25    pub n_folds: usize,
26    pub n_samples: usize,
27    pub shuffle: bool,
28    pub stratified: bool,
29}
30#[allow(dead_code)]
31impl CrossValidation {
32    pub fn new(k: usize, n: usize) -> Self {
33        CrossValidation {
34            n_folds: k,
35            n_samples: n,
36            shuffle: true,
37            stratified: false,
38        }
39    }
40    pub fn k_fold_5(n: usize) -> Self {
41        CrossValidation::new(5, n)
42    }
43    pub fn loocv(n: usize) -> Self {
44        CrossValidation::new(n, n)
45    }
46    pub fn fold_size(&self) -> usize {
47        (self.n_samples + self.n_folds - 1) / self.n_folds
48    }
49    pub fn train_size(&self) -> usize {
50        self.n_samples - self.fold_size()
51    }
52    pub fn n_train_test_splits(&self) -> usize {
53        self.n_folds
54    }
55}
56/// Early stopping regularization — implicit regularization by iteration count.
57pub struct EarlyStoppingReg {
58    /// Maximum number of gradient descent iterations.
59    pub max_iters: usize,
60    /// Step size.
61    pub step_size: f64,
62}
63impl EarlyStoppingReg {
64    /// Create a new EarlyStoppingReg.
65    pub fn new(max_iters: usize, step_size: f64) -> Self {
66        Self {
67            max_iters,
68            step_size,
69        }
70    }
71    /// Effective regularization parameter ≈ 1/(step_size * max_iters).
72    pub fn effective_lambda(&self) -> f64 {
73        1.0 / (self.step_size * self.max_iters as f64)
74    }
75}
76/// AdaBoost: adaptive boosting with exponential loss.
77pub struct AdaBoost {
78    /// Number of boosting rounds T.
79    pub rounds: usize,
80    /// Weights α_t for each weak learner.
81    pub alphas: Vec<f64>,
82    /// Per-round weak learner accuracies.
83    pub weak_accuracies: Vec<f64>,
84}
85impl AdaBoost {
86    /// Create a new AdaBoost instance.
87    pub fn new(rounds: usize) -> Self {
88        Self {
89            rounds,
90            alphas: Vec::new(),
91            weak_accuracies: Vec::new(),
92        }
93    }
94    /// Compute alpha_t = 0.5 * ln((1 - ε_t) / ε_t) for a weak learner with error ε_t.
95    pub fn compute_alpha(weak_error: f64) -> f64 {
96        0.5 * ((1.0 - weak_error) / weak_error).ln()
97    }
98    /// Training error bound after T rounds: ≤ exp(-2 Σ γ_t²) where γ_t = 0.5 - ε_t.
99    pub fn training_error_bound(gammas: &[f64]) -> f64 {
100        let sum_gamma_sq: f64 = gammas.iter().map(|g| g * g).sum();
101        (-2.0 * sum_gamma_sq).exp()
102    }
103    /// Record a round's weak learner accuracy.
104    pub fn add_round(&mut self, weak_accuracy: f64) {
105        let weak_error = 1.0 - weak_accuracy;
106        let alpha = Self::compute_alpha(weak_error);
107        self.alphas.push(alpha);
108        self.weak_accuracies.push(weak_accuracy);
109    }
110}
111/// Online Gradient Descent with regret bound O(√T).
112pub struct OnlineGradientDescent {
113    /// Current parameter vector w_t.
114    pub weights: Vec<f64>,
115    /// Learning rate η.
116    pub eta: f64,
117    /// Constraint set radius D (‖w‖ ≤ D).
118    pub d: f64,
119    /// Gradient norm bound G (‖∇_t‖ ≤ G).
120    pub g: f64,
121    /// Round count.
122    pub t: usize,
123}
124impl OnlineGradientDescent {
125    /// Create a new OGD instance.
126    pub fn new(dim: usize, eta: f64, d: f64, g: f64) -> Self {
127        Self {
128            weights: vec![0.0; dim],
129            eta,
130            d,
131            g,
132            t: 0,
133        }
134    }
135    /// Update: w_{t+1} = project(w_t - η ∇_t) onto ‖w‖ ≤ D.
136    pub fn update(&mut self, gradient: &[f64]) {
137        for (wi, &gi) in self.weights.iter_mut().zip(gradient.iter()) {
138            *wi -= self.eta * gi;
139        }
140        let norm: f64 = self.weights.iter().map(|wi| wi * wi).sum::<f64>().sqrt();
141        if norm > self.d {
142            let scale = self.d / norm;
143            for wi in self.weights.iter_mut() {
144                *wi *= scale;
145            }
146        }
147        self.t += 1;
148    }
149    /// Regret bound after T rounds: R_T ≤ D * G * √(2T).
150    pub fn regret_bound(&self) -> f64 {
151        self.d * self.g * (2.0 * self.t as f64).sqrt()
152    }
153}
154/// Support Vector Machine classifier.
155#[allow(dead_code)]
156#[derive(Debug, Clone)]
157pub struct SVMClassifier {
158    pub kernel_type: SVMKernel,
159    pub c_regularization: f64,
160    pub n_support_vectors: usize,
161}
162#[allow(dead_code)]
163impl SVMClassifier {
164    pub fn new(kernel: SVMKernel, c: f64) -> Self {
165        SVMClassifier {
166            kernel_type: kernel,
167            c_regularization: c,
168            n_support_vectors: 0,
169        }
170    }
171    pub fn linear(c: f64) -> Self {
172        SVMClassifier::new(SVMKernel::Linear, c)
173    }
174    pub fn rbf(gamma: f64, c: f64) -> Self {
175        SVMClassifier::new(SVMKernel::RBF(gamma), c)
176    }
177    pub fn kernel_value(&self, x: &[f64], xp: &[f64]) -> f64 {
178        match &self.kernel_type {
179            SVMKernel::Linear => dot(x, xp),
180            SVMKernel::Polynomial(d) => (dot(x, xp) + 1.0).powi(*d as i32),
181            SVMKernel::RBF(gamma) => {
182                let sq_dist = x
183                    .iter()
184                    .zip(xp.iter())
185                    .map(|(a, b)| (a - b).powi(2))
186                    .sum::<f64>();
187                (-gamma * sq_dist).exp()
188            }
189            SVMKernel::Sigmoid => (dot(x, xp)).tanh(),
190        }
191    }
192    pub fn sparsity_ratio(&self, n_training: usize) -> f64 {
193        if n_training == 0 {
194            return 0.0;
195        }
196        self.n_support_vectors as f64 / n_training as f64
197    }
198}
199/// Fisher information I(θ) = E\[(∂/∂θ log p(x;θ))²\].
200pub struct FisherInformation {
201    /// Log-density function log p(x; θ) as a closure index (stored as parameter).
202    pub theta: f64,
203}
204impl FisherInformation {
205    /// Create a new FisherInformation at parameter θ.
206    pub fn new(theta: f64) -> Self {
207        Self { theta }
208    }
209    /// Numerical estimate of Fisher information via finite-difference score.
210    ///
211    /// Given samples x_i and log_density function, I(θ) ≈ (1/n) Σ (∂/∂θ log p(x_i;θ))².
212    pub fn estimate(log_density: impl Fn(f64, f64) -> f64, theta: f64, samples: &[f64]) -> f64 {
213        let h = 1e-5;
214        let n = samples.len() as f64;
215        samples
216            .iter()
217            .map(|&x| {
218                let score = (log_density(x, theta + h) - log_density(x, theta - h)) / (2.0 * h);
219                score * score
220            })
221            .sum::<f64>()
222            / n
223    }
224    /// Cramér-Rao lower bound: Var(θ̂) ≥ 1/I(θ).
225    pub fn cramer_rao_bound(&self, fisher_val: f64) -> f64 {
226        if fisher_val > 0.0 {
227            1.0 / fisher_val
228        } else {
229            f64::INFINITY
230        }
231    }
232}
233/// Rademacher complexity estimate for a finite hypothesis class.
234///
235/// For a finite class H of size |H| over n samples:
236/// R_n(H) ≤ √(2 ln|H| / n).
237pub struct RademacherComplexity {
238    /// Number of samples n.
239    pub n: usize,
240    /// Number of hypotheses in the class |H|.
241    pub class_size: usize,
242}
243impl RademacherComplexity {
244    /// Create a new RademacherComplexity bound.
245    pub fn new(n: usize, class_size: usize) -> Self {
246        Self { n, class_size }
247    }
248    /// Upper bound: √(2 ln|H| / n).
249    pub fn upper_bound(&self) -> f64 {
250        (2.0 * (self.class_size as f64).ln() / self.n as f64).sqrt()
251    }
252    /// Generalization bound: L_D(h) ≤ L_S(h) + 2 R_n(H) + √(log(2/δ)/(2n)).
253    pub fn generalization_bound(&self, empirical_loss: f64, delta: f64) -> f64 {
254        let rn = self.upper_bound();
255        let confidence_term = ((2.0 / delta).ln() / (2.0 * self.n as f64)).sqrt();
256        empirical_loss + 2.0 * rn + confidence_term
257    }
258}
259/// Uniform convergence checker.
260pub struct UniformConvergence {
261    /// ε: uniform convergence tolerance.
262    pub eps: f64,
263    /// δ: failure probability.
264    pub delta: f64,
265}
266impl UniformConvergence {
267    /// Create a new UniformConvergence instance.
268    pub fn new(eps: f64, delta: f64) -> Self {
269        Self { eps, delta }
270    }
271    /// Required samples for ε-uniform convergence for a class of size |H|.
272    pub fn required_samples(&self, class_size: usize) -> usize {
273        let log_h = (class_size as f64).ln();
274        let log_delta = (1.0 / self.delta).ln();
275        ((2.0 * (log_h + log_delta)) / (self.eps * self.eps)).ceil() as usize
276    }
277}
278/// Bias-variance tradeoff decomposition: MSE = Bias² + Variance + Noise.
279pub struct BiasVarianceTradeoff {
280    /// Squared bias of the estimator.
281    pub bias_squared: f64,
282    /// Variance of the estimator.
283    pub variance: f64,
284    /// Irreducible noise level σ².
285    pub noise: f64,
286}
287impl BiasVarianceTradeoff {
288    /// Create a new BiasVarianceTradeoff.
289    pub fn new(bias_squared: f64, variance: f64, noise: f64) -> Self {
290        Self {
291            bias_squared,
292            variance,
293            noise,
294        }
295    }
296    /// Total expected MSE = Bias² + Var + σ².
297    pub fn total_mse(&self) -> f64 {
298        self.bias_squared + self.variance + self.noise
299    }
300}
301/// Mutual information I(X;Y) = H(X) + H(Y) - H(X,Y).
302pub struct MutualInformation;
303impl MutualInformation {
304    /// Compute I(X;Y) from a joint distribution table.
305    ///
306    /// `joint\[i\]\[j\]` = P(X=i, Y=j).
307    pub fn compute(joint: &[Vec<f64>]) -> f64 {
308        if joint.is_empty() {
309            return 0.0;
310        }
311        let _n_rows = joint.len();
312        let n_cols = joint[0].len();
313        let px: Vec<f64> = joint.iter().map(|row| row.iter().sum::<f64>()).collect();
314        let py: Vec<f64> = (0..n_cols)
315            .map(|j| {
316                joint
317                    .iter()
318                    .map(|row| row.get(j).copied().unwrap_or(0.0))
319                    .sum()
320            })
321            .collect();
322        let h_x: f64 = px.iter().filter(|&&p| p > 0.0).map(|&p| -p * p.ln()).sum();
323        let h_y: f64 = py.iter().filter(|&&p| p > 0.0).map(|&p| -p * p.ln()).sum();
324        let h_xy: f64 = joint
325            .iter()
326            .flat_map(|row| row.iter())
327            .filter(|&&p| p > 0.0)
328            .map(|&p| -p * p.ln())
329            .sum();
330        (h_x + h_y - h_xy).max(0.0)
331    }
332    /// Data processing inequality: I(X;Z) ≤ I(X;Y) for Markov chain X → Y → Z.
333    ///
334    /// Verifies that for the given joint tables, I(X;Z) ≤ I(X;Y).
335    pub fn data_processing_inequality(joint_xy: &[Vec<f64>], joint_xz: &[Vec<f64>]) -> bool {
336        let i_xy = Self::compute(joint_xy);
337        let i_xz = Self::compute(joint_xz);
338        i_xz <= i_xy + 1e-9
339    }
340}
341/// Ensemble method: gradient boosting.
342#[allow(dead_code)]
343#[derive(Debug, Clone)]
344pub struct GradientBoosting {
345    pub n_estimators: usize,
346    pub learning_rate: f64,
347    pub max_depth: usize,
348    pub loss: GBLoss,
349}
350#[allow(dead_code)]
351impl GradientBoosting {
352    pub fn new(n: usize, lr: f64, depth: usize, loss: GBLoss) -> Self {
353        GradientBoosting {
354            n_estimators: n,
355            learning_rate: lr,
356            max_depth: depth,
357            loss,
358        }
359    }
360    pub fn xgboost_style(n: usize) -> Self {
361        GradientBoosting::new(n, 0.1, 6, GBLoss::MSE)
362    }
363    pub fn effective_shrinkage(&self) -> f64 {
364        self.learning_rate
365    }
366    pub fn n_leaves_upper_bound(&self) -> usize {
367        2usize.pow(self.max_depth as u32)
368    }
369    pub fn is_regularized(&self) -> bool {
370        self.learning_rate < 1.0
371    }
372}
373/// Kernel (Gram) matrix K_{ij} = k(x_i, x_j).
374pub struct KernelMatrix {
375    /// The matrix entries.
376    pub entries: Vec<Vec<f64>>,
377    /// Number of data points n.
378    pub n: usize,
379}
380impl KernelMatrix {
381    /// Compute the kernel matrix for a dataset and kernel function.
382    pub fn compute(kernel: &KernelFunction, data: &[Vec<f64>]) -> Self {
383        let n = data.len();
384        let entries: Vec<Vec<f64>> = (0..n)
385            .map(|i| {
386                (0..n)
387                    .map(|j| kernel.evaluate(&data[i], &data[j]))
388                    .collect()
389            })
390            .collect();
391        Self { entries, n }
392    }
393    /// Trace of the kernel matrix (sum of diagonal entries).
394    pub fn trace(&self) -> f64 {
395        (0..self.n).map(|i| self.entries[i][i]).sum()
396    }
397}
398#[allow(dead_code)]
399#[derive(Debug, Clone, PartialEq)]
400pub enum SVMKernel {
401    Linear,
402    Polynomial(u32),
403    RBF(f64),
404    Sigmoid,
405}
406/// KL divergence D_KL(P‖Q) = Σ P(x) log(P(x)/Q(x)).
407pub struct KLDivergence;
408impl KLDivergence {
409    /// Compute D_KL(p ‖ q) in nats.  Returns ∞ if q(x) = 0 when p(x) > 0.
410    pub fn compute(p: &[f64], q: &[f64]) -> f64 {
411        p.iter()
412            .zip(q.iter())
413            .filter(|(&pi, _)| pi > 0.0)
414            .map(|(&pi, &qi)| {
415                if qi == 0.0 {
416                    f64::INFINITY
417                } else {
418                    pi * (pi / qi).ln()
419                }
420            })
421            .sum()
422    }
423    /// Non-negativity check: D_KL(p‖q) ≥ 0.
424    pub fn is_nonneg(p: &[f64], q: &[f64]) -> bool {
425        Self::compute(p, q) >= -1e-9
426    }
427}
428/// VC dimension and growth function calculations.
429pub struct VCDimension {
430    /// The claimed VC dimension.
431    pub dim: usize,
432}
433impl VCDimension {
434    /// Create a new VCDimension.
435    pub fn new(dim: usize) -> Self {
436        Self { dim }
437    }
438    /// Sauer-Shelah bound: Π_H(m) ≤ Σ_{i=0}^{d} C(m,i).
439    pub fn sauer_shelah_bound(&self, m: usize) -> usize {
440        let d = self.dim;
441        let mut bound = 0usize;
442        let mut binom = 1usize;
443        for i in 0..=d.min(m) {
444            if i > 0 {
445                binom = binom
446                    .saturating_mul(m - i + 1)
447                    .checked_div(i)
448                    .unwrap_or(binom);
449            }
450            bound = bound.saturating_add(binom);
451        }
452        bound
453    }
454    /// Check if d-dimensional threshold classifier can shatter m points.
455    ///
456    /// For the canonical 1D threshold classifier H = {h_θ : θ ∈ ℝ}, VC dim = 1.
457    /// This checks whether the bound is consistent with shattering.
458    pub fn can_shatter(&self, m: usize) -> bool {
459        m <= self.dim
460    }
461    /// Fundamental theorem of PAC learning: finite VC dim ↔ PAC learnability.
462    pub fn fundamental_theorem_pac(&self) -> bool {
463        self.dim < usize::MAX
464    }
465}
466/// Available kernel types.
467pub enum KernelType {
468    /// Linear kernel: k(x,y) = x·y.
469    Linear,
470    /// Polynomial kernel: k(x,y) = (x·y + c)^d.
471    Polynomial { degree: u32, offset: f64 },
472    /// RBF/Gaussian kernel: k(x,y) = exp(-‖x-y‖²/(2σ²)).
473    Rbf { sigma: f64 },
474    /// Laplace kernel: k(x,y) = exp(-‖x-y‖/σ).
475    Laplace { sigma: f64 },
476}
477/// Regret bound summary.
478pub struct RegretBound {
479    /// Number of rounds T.
480    pub t: usize,
481    /// Domain diameter D.
482    pub d: f64,
483    /// Gradient norm bound G.
484    pub g: f64,
485}
486impl RegretBound {
487    /// Create a new RegretBound.
488    pub fn new(t: usize, d: f64, g: f64) -> Self {
489        Self { t, d, g }
490    }
491    /// OGD regret bound: D * G * √(2T).
492    pub fn ogd_bound(&self) -> f64 {
493        self.d * self.g * (2.0 * self.t as f64).sqrt()
494    }
495}
496/// Sample complexity for PAC learning (Blumer et al. / Vapnik-Chervonenkis bound).
497///
498/// Returns m = ceil((d * ln(d/eps + 1) + ln(2/delta)) / eps) samples.
499/// Here `vc_dim` is the VC dimension d of the hypothesis class.
500pub struct SampleComplexity {
501    /// Accuracy parameter ε ∈ (0,1).
502    pub eps: f64,
503    /// Confidence parameter δ ∈ (0,1).
504    pub delta: f64,
505    /// VC dimension d of the hypothesis class.
506    pub vc_dim: usize,
507}
508impl SampleComplexity {
509    /// Create a new SampleComplexity instance.
510    pub fn new(eps: f64, delta: f64, vc_dim: usize) -> Self {
511        Self { eps, delta, vc_dim }
512    }
513    /// Compute the sample complexity upper bound.
514    pub fn compute(&self) -> usize {
515        let d = self.vc_dim as f64;
516        let numerator = d * (d / self.eps + 1.0).ln() + (2.0 / self.delta).ln();
517        (numerator / self.eps).ceil() as usize
518    }
519}
520/// Kernel SVM trainer using a simplified SMO algorithm.
521///
522/// Implements the Sequential Minimal Optimization (SMO) core update step.
523pub struct KernelSVMTrainer {
524    /// Number of training points.
525    pub n: usize,
526    /// Dual variables α_i ∈ \[0, C\].
527    pub alphas: Vec<f64>,
528    /// Labels y_i ∈ {-1, +1}.
529    pub labels: Vec<f64>,
530    /// Bias term b.
531    pub bias: f64,
532    /// Regularization parameter C (upper bound on α_i).
533    pub c: f64,
534}
535impl KernelSVMTrainer {
536    /// Create a new KernelSVMTrainer with zero alphas.
537    pub fn new(n: usize, labels: Vec<f64>, c: f64) -> Self {
538        Self {
539            n,
540            alphas: vec![0.0; n],
541            labels,
542            bias: 0.0,
543            c,
544        }
545    }
546    /// Compute the SVM decision function f(x) = Σ α_i y_i k(x_i, x) + b.
547    pub fn decision(&self, kernel_row: &[f64]) -> f64 {
548        self.alphas
549            .iter()
550            .zip(self.labels.iter())
551            .zip(kernel_row.iter())
552            .map(|((a, y), k)| a * y * k)
553            .sum::<f64>()
554            + self.bias
555    }
556    /// One SMO update step on a pair (i, j) given kernel matrix K.
557    ///
558    /// Returns true if a meaningful update was made.
559    pub fn smo_step(&mut self, i: usize, j: usize, k: &[Vec<f64>]) -> bool {
560        if i == j {
561            return false;
562        }
563        let yi = self.labels[i];
564        let yj = self.labels[j];
565        let fi = self.decision(&k[i]);
566        let fj = self.decision(&k[j]);
567        let ei = fi - yi;
568        let ej = fj - yj;
569        let eta = k[i][i] + k[j][j] - 2.0 * k[i][j];
570        if eta <= 0.0 {
571            return false;
572        }
573        let alpha_j_old = self.alphas[j];
574        let alpha_i_old = self.alphas[i];
575        let (l, h) = if (yi - yj).abs() < 1e-9 {
576            let sum = alpha_i_old + alpha_j_old;
577            ((sum - self.c).max(0.0), sum.min(self.c))
578        } else {
579            let diff = alpha_j_old - alpha_i_old;
580            ((-diff).max(0.0), (self.c - diff).min(self.c))
581        };
582        let alpha_j_new = (alpha_j_old + yj * (ei - ej) / eta).clamp(l, h);
583        if (alpha_j_new - alpha_j_old).abs() < 1e-5 {
584            return false;
585        }
586        let alpha_i_new = alpha_i_old + yi * yj * (alpha_j_old - alpha_j_new);
587        let b1 = self.bias
588            - ei
589            - yi * (alpha_i_new - alpha_i_old) * k[i][i]
590            - yj * (alpha_j_new - alpha_j_old) * k[i][j];
591        let b2 = self.bias
592            - ej
593            - yi * (alpha_i_new - alpha_i_old) * k[i][j]
594            - yj * (alpha_j_new - alpha_j_old) * k[j][j];
595        self.bias = (b1 + b2) / 2.0;
596        self.alphas[i] = alpha_i_new;
597        self.alphas[j] = alpha_j_new;
598        true
599    }
600    /// SVM generalization bound: ≤ R²/(γ² n) with margin γ.
601    pub fn generalization_bound(radius: f64, margin: f64, n: usize) -> f64 {
602        (radius * radius) / (margin * margin * n as f64)
603    }
604}
605/// PAC learner: wraps accuracy/confidence parameters.
606pub struct PACLearner {
607    /// ε: maximum tolerated generalization error.
608    pub eps: f64,
609    /// δ: failure probability (confidence 1−δ).
610    pub delta: f64,
611}
612impl PACLearner {
613    /// Create a new PAC learner.
614    pub fn new(eps: f64, delta: f64) -> Self {
615        Self { eps, delta }
616    }
617    /// Required sample size for a hypothesis class of VC dimension d.
618    pub fn required_samples(&self, vc_dim: usize) -> usize {
619        SampleComplexity::new(self.eps, self.delta, vc_dim).compute()
620    }
621}
622/// Evidence Lower Bound (ELBO) for variational inference.
623///
624/// ℒ(q) = E_q\[log p(x,z)\] - E_q\[log q(z)\] = log p(x) - D_KL(q(z) ‖ p(z|x))
625pub struct ELBO {
626    /// D_KL(q‖p) component.
627    pub kl_term: f64,
628    /// E_q\[log p(x,z)\] reconstruction term.
629    pub reconstruction_term: f64,
630}
631impl ELBO {
632    /// Create a new ELBO from its components.
633    pub fn new(reconstruction_term: f64, kl_term: f64) -> Self {
634        Self {
635            kl_term,
636            reconstruction_term,
637        }
638    }
639    /// Compute ℒ(q) = reconstruction_term - kl_term.
640    pub fn value(&self) -> f64 {
641        self.reconstruction_term - self.kl_term
642    }
643    /// Compute ELBO from discrete distributions q(z) and joint p(x,z).
644    ///
645    /// ℒ = Σ_z q(z) log(p(x,z)/q(z))
646    pub fn compute(q: &[f64], p_joint: &[f64]) -> f64 {
647        q.iter()
648            .zip(p_joint.iter())
649            .filter(|(&qi, _)| qi > 0.0)
650            .map(|(&qi, &pi)| {
651                if pi == 0.0 {
652                    f64::NEG_INFINITY
653                } else {
654                    qi * (pi / qi).ln()
655                }
656            })
657            .sum()
658    }
659}
660/// Tikhonov (ridge) regularization: min_h L(h) + λ‖h‖².
661pub struct TikhonovReg {
662    /// Regularization parameter λ.
663    pub lambda: f64,
664}
665impl TikhonovReg {
666    /// Create a new TikhonovReg.
667    pub fn new(lambda: f64) -> Self {
668        Self { lambda }
669    }
670    /// Ridge regression closed-form solution: w = (X^T X + λI)^{-1} X^T y.
671    /// Here X is n×d (row major), y is length-n.  Returns weight vector w.
672    pub fn solve(&self, x: &[Vec<f64>], y: &[f64]) -> Vec<f64> {
673        let d = if x.is_empty() { 0 } else { x[0].len() };
674        let n = x.len();
675        let mut xtx = vec![vec![0.0f64; d]; d];
676        for i in 0..d {
677            xtx[i][i] = self.lambda;
678        }
679        for row in x {
680            for i in 0..d {
681                for j in 0..d {
682                    xtx[i][j] += row[i] * row[j];
683                }
684            }
685        }
686        let mut xty = vec![0.0f64; d];
687        for (row, &yi) in x.iter().zip(y.iter()) {
688            for j in 0..d {
689                xty[j] += row[j] * yi;
690            }
691        }
692        gauss_solve(&xtx, &xty, d, n)
693    }
694    /// Regularization penalty: λ‖w‖².
695    pub fn penalty(&self, w: &[f64]) -> f64 {
696        self.lambda * w.iter().map(|wi| wi * wi).sum::<f64>()
697    }
698}
699/// Backdoor adjustment formula for causal inference.
700///
701/// Computes P(Y | do(X=x)) = Σ_z P(Y | X=x, Z=z) * P(Z=z)
702/// given a set of confounder strata z.
703pub struct CausalBackdoor {
704    /// Conditional probabilities P(Y=1 | X=x, Z=z) for each stratum z.
705    pub cond_probs: Vec<f64>,
706    /// Marginal probabilities P(Z=z) for each stratum z.
707    pub stratum_probs: Vec<f64>,
708}
709impl CausalBackdoor {
710    /// Create a new CausalBackdoor instance.
711    pub fn new(cond_probs: Vec<f64>, stratum_probs: Vec<f64>) -> Self {
712        Self {
713            cond_probs,
714            stratum_probs,
715        }
716    }
717    /// Compute P(Y=1 | do(X=x)) via backdoor adjustment.
718    ///
719    /// Returns Σ_z P(Y=1 | X=x, Z=z) * P(Z=z).
720    pub fn adjust(&self) -> f64 {
721        self.cond_probs
722            .iter()
723            .zip(self.stratum_probs.iter())
724            .map(|(p, q)| p * q)
725            .sum()
726    }
727    /// Compute the confounding bias: |observational P(Y|X=x) - interventional P(Y|do(X=x))|.
728    pub fn confounding_bias(&self, observational: f64) -> f64 {
729        (observational - self.adjust()).abs()
730    }
731    /// Verify the backdoor adjustment probabilities sum to ≤ 1 (sanity check).
732    pub fn is_valid(&self) -> bool {
733        let sum: f64 = self.stratum_probs.iter().sum();
734        (sum - 1.0).abs() < 1e-6
735    }
736}
737/// Double Rademacher (two-sided) bound.
738pub struct DoubleRademacher {
739    /// Rademacher complexity instance.
740    pub rademacher: RademacherComplexity,
741}
742impl DoubleRademacher {
743    /// Create a new DoubleRademacher instance.
744    pub fn new(n: usize, class_size: usize) -> Self {
745        Self {
746            rademacher: RademacherComplexity::new(n, class_size),
747        }
748    }
749    /// Two-sided bound: |L_D(h) - L_S(h)| ≤ 2 R_n(H) w.h.p.
750    pub fn two_sided_bound(&self) -> f64 {
751        2.0 * self.rademacher.upper_bound()
752    }
753}
754/// Online Perceptron classifier.
755pub struct Perceptron {
756    /// Weight vector w ∈ ℝ^d.
757    pub weights: Vec<f64>,
758    /// Bias term b.
759    pub bias: f64,
760    /// Number of mistakes made so far.
761    pub mistakes: usize,
762}
763impl Perceptron {
764    /// Create a new zero-initialized Perceptron.
765    pub fn new(dim: usize) -> Self {
766        Self {
767            weights: vec![0.0; dim],
768            bias: 0.0,
769            mistakes: 0,
770        }
771    }
772    /// Predict the label for input x: sign(w·x + b).
773    pub fn predict(&self, x: &[f64]) -> f64 {
774        let score = dot(&self.weights, x) + self.bias;
775        if score >= 0.0 {
776            1.0
777        } else {
778            -1.0
779        }
780    }
781    /// Online update: if prediction wrong, w ← w + y·x, b ← b + y.
782    pub fn update(&mut self, x: &[f64], label: f64) -> bool {
783        let pred = self.predict(x);
784        if (pred * label) <= 0.0 {
785            for (wi, &xi) in self.weights.iter_mut().zip(x.iter()) {
786                *wi += label * xi;
787            }
788            self.bias += label;
789            self.mistakes += 1;
790            true
791        } else {
792            false
793        }
794    }
795    /// Perceptron mistake bound: M ≤ (R/γ)² where R = radius, γ = margin.
796    pub fn mistake_bound(radius: f64, margin: f64) -> usize {
797        ((radius / margin).powi(2)).ceil() as usize
798    }
799}
800/// Gaussian complexity (Gaussian analog of Rademacher).
801pub struct GaussianComplexity {
802    /// Number of samples n.
803    pub n: usize,
804    /// Number of hypotheses in the class.
805    pub class_size: usize,
806}
807impl GaussianComplexity {
808    /// Create a new GaussianComplexity instance.
809    pub fn new(n: usize, class_size: usize) -> Self {
810        Self { n, class_size }
811    }
812    /// Upper bound for Gaussian complexity: √(2 ln|H| / n) (same as Rademacher by comparison).
813    pub fn upper_bound(&self) -> f64 {
814        (2.0 * (self.class_size as f64).ln() / self.n as f64).sqrt()
815    }
816}
817#[allow(dead_code)]
818#[derive(Debug, Clone, PartialEq)]
819pub enum GBLoss {
820    MSE,
821    MAE,
822    LogLoss,
823    Huber(f64),
824}
825/// Lasso (ℓ₁) regularization: min_h L(h) + λ‖h‖₁.
826pub struct LassoReg {
827    /// Regularization parameter λ.
828    pub lambda: f64,
829}
830impl LassoReg {
831    /// Create a new LassoReg.
832    pub fn new(lambda: f64) -> Self {
833        Self { lambda }
834    }
835    /// Soft-thresholding operator: sign(w) * max(|w| - λ, 0) per coordinate.
836    pub fn soft_threshold(&self, w: &[f64]) -> Vec<f64> {
837        w.iter()
838            .map(|&wi| {
839                let abs_wi = wi.abs();
840                if abs_wi <= self.lambda {
841                    0.0
842                } else {
843                    wi.signum() * (abs_wi - self.lambda)
844                }
845            })
846            .collect()
847    }
848    /// Regularization penalty: λ‖w‖₁.
849    pub fn penalty(&self, w: &[f64]) -> f64 {
850        self.lambda * w.iter().map(|wi| wi.abs()).sum::<f64>()
851    }
852}
853/// UCB1 (Upper Confidence Bound) algorithm for multi-armed bandits.
854///
855/// Achieves cumulative regret O(√(n T ln T)) where n = number of arms.
856pub struct UCBBandit {
857    /// Number of arms.
858    pub n: usize,
859    /// Number of times each arm has been pulled.
860    pub counts: Vec<usize>,
861    /// Empirical mean reward for each arm.
862    pub means: Vec<f64>,
863    /// Total rounds elapsed.
864    pub t: usize,
865}
866impl UCBBandit {
867    /// Create a new UCB1 bandit instance.
868    pub fn new(n: usize) -> Self {
869        Self {
870            n,
871            counts: vec![0; n],
872            means: vec![0.0; n],
873            t: 0,
874        }
875    }
876    /// Select the arm with the highest UCB index.
877    ///
878    /// UCB index for arm i: μ_i + √(2 ln t / n_i).
879    /// Arms with count 0 are always selected first (infinite UCB).
880    pub fn select(&self) -> usize {
881        if let Some(i) = self.counts.iter().position(|&c| c == 0) {
882            return i;
883        }
884        let ln_t = (self.t as f64).ln();
885        (0..self.n)
886            .max_by(|&i, &j| {
887                let ucb_i = self.means[i] + (2.0 * ln_t / self.counts[i] as f64).sqrt();
888                let ucb_j = self.means[j] + (2.0 * ln_t / self.counts[j] as f64).sqrt();
889                ucb_i
890                    .partial_cmp(&ucb_j)
891                    .unwrap_or(std::cmp::Ordering::Equal)
892            })
893            .unwrap_or(0)
894    }
895    /// Update the chosen arm with observed reward.
896    pub fn update(&mut self, arm: usize, reward: f64) {
897        self.counts[arm] += 1;
898        let n = self.counts[arm] as f64;
899        self.means[arm] += (reward - self.means[arm]) / n;
900        self.t += 1;
901    }
902    /// UCB1 regret bound: O(√(n T ln T)).
903    pub fn regret_bound_upper(&self) -> f64 {
904        let t = self.t as f64;
905        let n = self.n as f64;
906        (n * t * t.ln()).sqrt()
907    }
908}
909/// PAC-Bayes generalization bound computation.
910///
911/// McAllester's bound: L_D(Q) ≤ L_S(Q) + √((KL(Q‖P) + ln(n/δ)) / (2n)).
912pub struct PACBayesGeneralization {
913    /// KL divergence KL(Q‖P) in nats.
914    pub kl_qp: f64,
915    /// Number of training samples n.
916    pub n: usize,
917    /// Confidence parameter δ.
918    pub delta: f64,
919}
920impl PACBayesGeneralization {
921    /// Create a new PAC-Bayes bound instance.
922    pub fn new(kl_qp: f64, n: usize, delta: f64) -> Self {
923        Self { kl_qp, n, delta }
924    }
925    /// McAllester bound: empirical_loss + √((KL + ln(n/δ)) / (2n)).
926    pub fn mcallester_bound(&self, empirical_loss: f64) -> f64 {
927        let penalty =
928            ((self.kl_qp + (self.n as f64 / self.delta).ln()) / (2.0 * self.n as f64)).sqrt();
929        empirical_loss + penalty
930    }
931    /// Catoni's tighter bound (using solve for λ-parameterized form).
932    /// Approximation: L_D(Q) ≤ (1/(1-λ/2)) * (L_S(Q) + KL/(2λn)).
933    pub fn catoni_bound(&self, empirical_loss: f64, lambda: f64) -> f64 {
934        let scale = 1.0 / (1.0 - lambda / 2.0);
935        let penalty = self.kl_qp / (2.0 * lambda * self.n as f64);
936        scale * (empirical_loss + penalty)
937    }
938    /// Optimal λ for Catoni bound (minimizing RHS).
939    pub fn optimal_lambda(&self, empirical_loss: f64) -> f64 {
940        let ratio = self.n as f64 * empirical_loss / (self.kl_qp.max(1e-9));
941        1.0 / (1.0 + ratio).sqrt()
942    }
943}
944/// Exponential Weights Algorithm (Hedge / EWA) for online learning.
945///
946/// Maintains a distribution over n experts and uses multiplicative updates.
947/// Guarantees regret R_T ≤ ln(n)/η + η T/2.
948pub struct ExponentialWeightsAlgorithm {
949    /// Number of experts.
950    pub n: usize,
951    /// Learning rate η.
952    pub eta: f64,
953    /// Current weights (unnormalized).
954    pub weights: Vec<f64>,
955    /// Total rounds elapsed.
956    pub rounds: usize,
957}
958impl ExponentialWeightsAlgorithm {
959    /// Create a new EWA with uniform initial weights.
960    pub fn new(n: usize, eta: f64) -> Self {
961        Self {
962            n,
963            eta,
964            weights: vec![1.0; n],
965            rounds: 0,
966        }
967    }
968    /// Return the current probability distribution over experts.
969    pub fn distribution(&self) -> Vec<f64> {
970        let total: f64 = self.weights.iter().sum();
971        self.weights.iter().map(|w| w / total).collect()
972    }
973    /// Multiplicative update: w_i ← w_i * exp(-η * loss_i).
974    pub fn update(&mut self, losses: &[f64]) {
975        for (w, &l) in self.weights.iter_mut().zip(losses.iter()) {
976            *w *= (-self.eta * l).exp();
977        }
978        self.rounds += 1;
979    }
980    /// EWA regret bound: R_T ≤ ln(n)/η + η * T / 2.
981    pub fn regret_bound(&self) -> f64 {
982        let t = self.rounds as f64;
983        (self.n as f64).ln() / self.eta + self.eta * t / 2.0
984    }
985    /// Optimal learning rate for T rounds: η* = √(2 ln n / T).
986    pub fn optimal_eta(n: usize, t: usize) -> f64 {
987        (2.0 * (n as f64).ln() / t as f64).sqrt()
988    }
989}
990/// Feature map: maps inputs to a (truncated) explicit feature space.
991pub struct FeatureMap {
992    /// Dimensionality of the feature space.
993    pub feature_dim: usize,
994}
995impl FeatureMap {
996    /// Create a new feature map.
997    pub fn new(feature_dim: usize) -> Self {
998        Self { feature_dim }
999    }
1000    /// Compute the inner product ⟨φ(x), φ(y)⟩ in feature space.
1001    /// For the linear kernel, φ(x) = x, so this is just the dot product.
1002    pub fn inner_product(&self, x: &[f64], y: &[f64]) -> f64 {
1003        dot(x, y)
1004    }
1005}
1006/// Kernel SVM dual representation.
1007pub struct KernelSVM {
1008    /// Dual variables α_i.
1009    pub alphas: Vec<f64>,
1010    /// Labels y_i ∈ {-1, +1}.
1011    pub labels: Vec<f64>,
1012    /// Bias term b.
1013    pub bias: f64,
1014    /// Regularization parameter C.
1015    pub c: f64,
1016}
1017impl KernelSVM {
1018    /// Create a new kernel SVM (initialized to zero weights).
1019    pub fn new(n: usize, c: f64) -> Self {
1020        Self {
1021            alphas: vec![0.0; n],
1022            labels: vec![1.0; n],
1023            bias: 0.0,
1024            c,
1025        }
1026    }
1027    /// Decision function: f(x) = Σ α_i y_i k(x_i, x) + b.
1028    pub fn predict(&self, kernel_vals: &[f64]) -> f64 {
1029        let sum: f64 = self
1030            .alphas
1031            .iter()
1032            .zip(self.labels.iter())
1033            .zip(kernel_vals.iter())
1034            .map(|((a, y), k)| a * y * k)
1035            .sum();
1036        sum + self.bias
1037    }
1038}
1039/// Gaussian process regression model.
1040#[allow(dead_code)]
1041#[derive(Debug, Clone)]
1042pub struct GaussianProcess {
1043    pub mean: f64,
1044    pub length_scale: f64,
1045    pub signal_variance: f64,
1046    pub noise_variance: f64,
1047}
1048#[allow(dead_code)]
1049impl GaussianProcess {
1050    pub fn new(mean: f64, length_scale: f64, signal_var: f64, noise_var: f64) -> Self {
1051        GaussianProcess {
1052            mean,
1053            length_scale,
1054            signal_variance: signal_var,
1055            noise_variance: noise_var,
1056        }
1057    }
1058    pub fn default_rbf() -> Self {
1059        GaussianProcess::new(0.0, 1.0, 1.0, 0.01)
1060    }
1061    /// RBF (squared exponential) kernel: k(x, x') = σ^2 exp(-|x-x'|^2 / (2l^2)).
1062    pub fn rbf_kernel(&self, x: f64, xp: f64) -> f64 {
1063        let d = x - xp;
1064        self.signal_variance * (-d * d / (2.0 * self.length_scale.powi(2))).exp()
1065    }
1066    /// Predictive variance at a new point (simplified: just signal variance).
1067    pub fn predictive_variance(&self, x: f64, train_x: &[f64]) -> f64 {
1068        let k_star_star = self.rbf_kernel(x, x);
1069        let k_noise: Vec<f64> = train_x.iter().map(|&xi| self.rbf_kernel(x, xi)).collect();
1070        let contrib: f64 = k_noise.iter().map(|&k| k * k).sum::<f64>()
1071            / (self.signal_variance + self.noise_variance).max(1e-10);
1072        (k_star_star - contrib).max(self.noise_variance)
1073    }
1074    pub fn log_marginal_likelihood_approx(&self, n: usize) -> f64 {
1075        -(n as f64) / 2.0 * (2.0 * std::f64::consts::PI * self.signal_variance).ln()
1076    }
1077}
1078/// A kernel function k: ℝ^d × ℝ^d → ℝ.
1079pub struct KernelFunction {
1080    /// Kernel type identifier.
1081    pub kernel_type: KernelType,
1082}
1083impl KernelFunction {
1084    /// Create a new kernel function.
1085    pub fn new(kernel_type: KernelType) -> Self {
1086        Self { kernel_type }
1087    }
1088    /// Evaluate the kernel k(x, y) where x, y are vectors in ℝ^d.
1089    pub fn evaluate(&self, x: &[f64], y: &[f64]) -> f64 {
1090        match &self.kernel_type {
1091            KernelType::Linear => dot(x, y),
1092            KernelType::Polynomial { degree, offset } => (dot(x, y) + offset).powi(*degree as i32),
1093            KernelType::Rbf { sigma } => {
1094                let diff_sq: f64 = x.iter().zip(y).map(|(a, b)| (a - b).powi(2)).sum();
1095                (-diff_sq / (2.0 * sigma * sigma)).exp()
1096            }
1097            KernelType::Laplace { sigma } => {
1098                let diff_norm: f64 = x.iter().zip(y).map(|(a, b)| (a - b).abs()).sum();
1099                (-diff_norm / sigma).exp()
1100            }
1101        }
1102    }
1103    /// Check if the kernel matrix for a set of points is positive semi-definite
1104    /// (via Cholesky: all pivots ≥ -ε for numerical tolerance).
1105    pub fn is_positive_definite(&self, points: &[Vec<f64>]) -> bool {
1106        let n = points.len();
1107        let mut k: Vec<Vec<f64>> = (0..n)
1108            .map(|i| {
1109                (0..n)
1110                    .map(|j| self.evaluate(&points[i], &points[j]))
1111                    .collect()
1112            })
1113            .collect();
1114        for i in 0..n {
1115            for j in 0..i {
1116                let mut sum = k[i][j];
1117                for l in 0..j {
1118                    sum -= k[i][l] * k[j][l];
1119                }
1120                k[i][j] = sum / k[j][j];
1121            }
1122            let mut diag = k[i][i];
1123            for l in 0..i {
1124                diag -= k[i][l] * k[i][l];
1125            }
1126            if diag < -1e-9 {
1127                return false;
1128            }
1129            k[i][i] = diag.max(0.0).sqrt();
1130        }
1131        true
1132    }
1133}