Skip to main content

oxilean_std/statistical_learning/
types.rs

1//! Auto-generated module
2//!
3//! 🤖 Generated with [SplitRS](https://github.com/cool-japan/splitrs)
4
5use super::functions::*;
6/// Growth function Π_H(m) for a hypothesis class.
7pub struct GrowthFunction {
8    /// VC dimension of the hypothesis class.
9    pub vc_dim: usize,
10}
11impl GrowthFunction {
12    /// Create a new GrowthFunction.
13    pub fn new(vc_dim: usize) -> Self {
14        Self { vc_dim }
15    }
16    /// Evaluate the Sauer-Shelah upper bound for Π_H(m).
17    pub fn evaluate(&self, m: usize) -> usize {
18        VCDimension::new(self.vc_dim).sauer_shelah_bound(m)
19    }
20}
/// Cross-validation scheme.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct CrossValidation {
    pub n_folds: usize,
    pub n_samples: usize,
    pub shuffle: bool,
    pub stratified: bool,
}
#[allow(dead_code)]
impl CrossValidation {
    /// Build a k-fold scheme over `n` samples (shuffled, unstratified).
    pub fn new(k: usize, n: usize) -> Self {
        Self {
            n_folds: k,
            n_samples: n,
            shuffle: true,
            stratified: false,
        }
    }
    /// Conventional 5-fold cross-validation.
    pub fn k_fold_5(n: usize) -> Self {
        Self::new(5, n)
    }
    /// Leave-one-out CV: as many folds as samples.
    pub fn loocv(n: usize) -> Self {
        Self::new(n, n)
    }
    /// Size of each held-out fold (ceiling division of samples by folds).
    pub fn fold_size(&self) -> usize {
        let (n, k) = (self.n_samples, self.n_folds);
        (n + k - 1) / k
    }
    /// Number of samples used for training in each split.
    pub fn train_size(&self) -> usize {
        self.n_samples - self.fold_size()
    }
    /// One train/test split per fold.
    pub fn n_train_test_splits(&self) -> usize {
        self.n_folds
    }
}
/// Early stopping regularization — implicit regularization by iteration count.
pub struct EarlyStoppingReg {
    /// Maximum number of gradient descent iterations.
    pub max_iters: usize,
    /// Step size.
    pub step_size: f64,
}
impl EarlyStoppingReg {
    /// Build an early-stopping schedule from an iteration budget and step size.
    pub fn new(max_iters: usize, step_size: f64) -> Self {
        EarlyStoppingReg {
            max_iters,
            step_size,
        }
    }
    /// Effective ridge strength λ ≈ 1/(η·T): stopping after T steps of size η
    /// behaves like Tikhonov regularization of this magnitude.
    pub fn effective_lambda(&self) -> f64 {
        let optimization_path = self.step_size * self.max_iters as f64;
        optimization_path.recip()
    }
}
/// AdaBoost: adaptive boosting with exponential loss.
pub struct AdaBoost {
    /// Number of boosting rounds T.
    pub rounds: usize,
    /// Weights α_t for each weak learner.
    pub alphas: Vec<f64>,
    /// Per-round weak learner accuracies.
    pub weak_accuracies: Vec<f64>,
}
impl AdaBoost {
    /// Start an AdaBoost run with no rounds recorded yet.
    pub fn new(rounds: usize) -> Self {
        AdaBoost {
            rounds,
            alphas: vec![],
            weak_accuracies: vec![],
        }
    }
    /// Learner weight α = ½·ln((1-ε)/ε) for weak-learner error ε.
    ///
    /// ε = ½ gives α = 0 (a coin-flip learner carries no weight).
    pub fn compute_alpha(weak_error: f64) -> f64 {
        let odds = (1.0 - weak_error) / weak_error;
        0.5 * odds.ln()
    }
    /// Freund–Schapire training error bound exp(-2 Σ_t γ_t²), γ_t = ½ - ε_t.
    pub fn training_error_bound(gammas: &[f64]) -> f64 {
        let edge_energy = gammas.iter().fold(0.0, |acc, g| acc + g * g);
        (-2.0 * edge_energy).exp()
    }
    /// Record one boosting round from its weak-learner accuracy.
    pub fn add_round(&mut self, weak_accuracy: f64) {
        let eps = 1.0 - weak_accuracy;
        self.alphas.push(Self::compute_alpha(eps));
        self.weak_accuracies.push(weak_accuracy);
    }
}
/// Online Gradient Descent with regret bound O(√T).
pub struct OnlineGradientDescent {
    /// Current parameter vector w_t.
    pub weights: Vec<f64>,
    /// Learning rate η.
    pub eta: f64,
    /// Constraint set radius D (‖w‖ ≤ D).
    pub d: f64,
    /// Gradient norm bound G (‖∇_t‖ ≤ G).
    pub g: f64,
    /// Round count.
    pub t: usize,
}
impl OnlineGradientDescent {
    /// Start OGD at the origin of ℝ^dim.
    pub fn new(dim: usize, eta: f64, d: f64, g: f64) -> Self {
        Self {
            weights: vec![0.0; dim],
            eta,
            d,
            g,
            t: 0,
        }
    }
    /// One round: gradient step w ← w - η·∇, then Euclidean projection onto
    /// the ball ‖w‖ ≤ D (rescale only when the step leaves the ball).
    pub fn update(&mut self, gradient: &[f64]) {
        for (w, g) in self.weights.iter_mut().zip(gradient) {
            *w -= self.eta * g;
        }
        let sq_norm: f64 = self.weights.iter().map(|w| w * w).sum();
        let norm = sq_norm.sqrt();
        if norm > self.d {
            let shrink = self.d / norm;
            self.weights.iter_mut().for_each(|w| *w *= shrink);
        }
        self.t += 1;
    }
    /// Standard OGD regret bound R_T ≤ D·G·√(2T) for the rounds seen so far.
    pub fn regret_bound(&self) -> f64 {
        self.d * self.g * (2.0 * self.t as f64).sqrt()
    }
}
154/// Support Vector Machine classifier.
155#[allow(dead_code)]
156#[derive(Debug, Clone)]
157pub struct SVMClassifier {
158    pub kernel_type: SVMKernel,
159    pub c_regularization: f64,
160    pub n_support_vectors: usize,
161}
162#[allow(dead_code)]
163impl SVMClassifier {
164    pub fn new(kernel: SVMKernel, c: f64) -> Self {
165        SVMClassifier {
166            kernel_type: kernel,
167            c_regularization: c,
168            n_support_vectors: 0,
169        }
170    }
171    pub fn linear(c: f64) -> Self {
172        SVMClassifier::new(SVMKernel::Linear, c)
173    }
174    pub fn rbf(gamma: f64, c: f64) -> Self {
175        SVMClassifier::new(SVMKernel::RBF(gamma), c)
176    }
177    pub fn kernel_value(&self, x: &[f64], xp: &[f64]) -> f64 {
178        match &self.kernel_type {
179            SVMKernel::Linear => dot(x, xp),
180            SVMKernel::Polynomial(d) => (dot(x, xp) + 1.0).powi(*d as i32),
181            SVMKernel::RBF(gamma) => {
182                let sq_dist = x
183                    .iter()
184                    .zip(xp.iter())
185                    .map(|(a, b)| (a - b).powi(2))
186                    .sum::<f64>();
187                (-gamma * sq_dist).exp()
188            }
189            SVMKernel::Sigmoid => (dot(x, xp)).tanh(),
190        }
191    }
192    pub fn sparsity_ratio(&self, n_training: usize) -> f64 {
193        if n_training == 0 {
194            return 0.0;
195        }
196        self.n_support_vectors as f64 / n_training as f64
197    }
198}
/// Fisher information I(θ) = E[(∂/∂θ log p(x;θ))²].
pub struct FisherInformation {
    /// Parameter value θ at which the information is taken.
    pub theta: f64,
}
impl FisherInformation {
    /// Anchor the Fisher information at parameter θ.
    pub fn new(theta: f64) -> Self {
        FisherInformation { theta }
    }
    /// Sample estimate of I(θ): average squared score over the samples,
    /// with the score ∂/∂θ log p(x;θ) taken by central finite differences.
    pub fn estimate(log_density: impl Fn(f64, f64) -> f64, theta: f64, samples: &[f64]) -> f64 {
        let h = 1e-5;
        let total: f64 = samples
            .iter()
            .map(|&x| {
                let forward = log_density(x, theta + h);
                let backward = log_density(x, theta - h);
                let score = (forward - backward) / (2.0 * h);
                score * score
            })
            .sum();
        total / samples.len() as f64
    }
    /// Cramér-Rao lower bound 1/I(θ); ∞ when the information is not positive.
    pub fn cramer_rao_bound(&self, fisher_val: f64) -> f64 {
        if fisher_val > 0.0 {
            1.0 / fisher_val
        } else {
            f64::INFINITY
        }
    }
}
/// Rademacher complexity estimate for a finite hypothesis class.
///
/// For a finite class H of size |H| over n samples:
/// R_n(H) ≤ √(2 ln|H| / n).
pub struct RademacherComplexity {
    /// Number of samples n.
    pub n: usize,
    /// Number of hypotheses in the class |H|.
    pub class_size: usize,
}
impl RademacherComplexity {
    /// Bound instance for n samples and |H| hypotheses.
    pub fn new(n: usize, class_size: usize) -> Self {
        RademacherComplexity { n, class_size }
    }
    /// Finite-class (Massart-style) bound √(2 ln|H| / n).
    pub fn upper_bound(&self) -> f64 {
        let ln_h = (self.class_size as f64).ln();
        (2.0 * ln_h / self.n as f64).sqrt()
    }
    /// Generalization bound with confidence 1-δ:
    /// L_D(h) ≤ L_S(h) + 2·R_n(H) + √(ln(2/δ)/(2n)).
    pub fn generalization_bound(&self, empirical_loss: f64, delta: f64) -> f64 {
        let samples = self.n as f64;
        let slack = ((2.0 / delta).ln() / (2.0 * samples)).sqrt();
        empirical_loss + 2.0 * self.upper_bound() + slack
    }
}
/// Uniform convergence checker.
pub struct UniformConvergence {
    /// ε: uniform convergence tolerance.
    pub eps: f64,
    /// δ: failure probability.
    pub delta: f64,
}
impl UniformConvergence {
    /// Bundle the (ε, δ) accuracy/confidence pair.
    pub fn new(eps: f64, delta: f64) -> Self {
        UniformConvergence { eps, delta }
    }
    /// Finite-class sample complexity m ≥ 2(ln|H| + ln(1/δ))/ε² for
    /// ε-uniform convergence with probability at least 1-δ.
    pub fn required_samples(&self, class_size: usize) -> usize {
        let complexity = (class_size as f64).ln() + (1.0 / self.delta).ln();
        (2.0 * complexity / (self.eps * self.eps)).ceil() as usize
    }
}
/// Bias-variance tradeoff decomposition: MSE = Bias² + Variance + Noise.
pub struct BiasVarianceTradeoff {
    /// Squared bias of the estimator.
    pub bias_squared: f64,
    /// Variance of the estimator.
    pub variance: f64,
    /// Irreducible noise level σ².
    pub noise: f64,
}
impl BiasVarianceTradeoff {
    /// Assemble a decomposition from its three components.
    pub fn new(bias_squared: f64, variance: f64, noise: f64) -> Self {
        BiasVarianceTradeoff {
            bias_squared,
            variance,
            noise,
        }
    }
    /// Expected MSE: the sum Bias² + Var + σ².
    pub fn total_mse(&self) -> f64 {
        let reducible = self.bias_squared + self.variance;
        reducible + self.noise
    }
}
/// Mutual information I(X;Y) = H(X) + H(Y) - H(X,Y).
pub struct MutualInformation;
impl MutualInformation {
    /// Compute I(X;Y) in nats from a joint probability table where
    /// `joint[i][j]` = P(X=i, Y=j).
    ///
    /// Marginals are formed by summing rows/columns (ragged rows padded with
    /// 0); zero-probability cells contribute nothing to the entropies, and
    /// the result is clamped at 0 to absorb floating-point cancellation.
    pub fn compute(joint: &[Vec<f64>]) -> f64 {
        let first_row = match joint.first() {
            Some(row) => row,
            None => return 0.0,
        };
        let n_cols = first_row.len();
        // Shannon entropy of a probability list, skipping zeros.
        let entropy = |probs: &[f64]| -> f64 {
            probs
                .iter()
                .filter(|&&p| p > 0.0)
                .map(|&p| -p * p.ln())
                .sum()
        };
        let px: Vec<f64> = joint.iter().map(|row| row.iter().sum()).collect();
        let py: Vec<f64> = (0..n_cols)
            .map(|col| {
                joint
                    .iter()
                    .map(|row| row.get(col).copied().unwrap_or(0.0))
                    .sum()
            })
            .collect();
        let h_x = entropy(&px);
        let h_y = entropy(&py);
        let h_xy: f64 = joint
            .iter()
            .flat_map(|row| row.iter())
            .filter(|&&p| p > 0.0)
            .map(|&p| -p * p.ln())
            .sum();
        (h_x + h_y - h_xy).max(0.0)
    }
    /// Data processing inequality check for X → Y → Z: I(X;Z) ≤ I(X;Y)
    /// (with a small numerical slack).
    pub fn data_processing_inequality(joint_xy: &[Vec<f64>], joint_xz: &[Vec<f64>]) -> bool {
        Self::compute(joint_xz) <= Self::compute(joint_xy) + 1e-9
    }
}
341/// Ensemble method: gradient boosting.
342#[allow(dead_code)]
343#[derive(Debug, Clone)]
344pub struct GradientBoosting {
345    pub n_estimators: usize,
346    pub learning_rate: f64,
347    pub max_depth: usize,
348    pub loss: GBLoss,
349}
350#[allow(dead_code)]
351impl GradientBoosting {
352    pub fn new(n: usize, lr: f64, depth: usize, loss: GBLoss) -> Self {
353        GradientBoosting {
354            n_estimators: n,
355            learning_rate: lr,
356            max_depth: depth,
357            loss,
358        }
359    }
360    pub fn xgboost_style(n: usize) -> Self {
361        GradientBoosting::new(n, 0.1, 6, GBLoss::MSE)
362    }
363    pub fn effective_shrinkage(&self) -> f64 {
364        self.learning_rate
365    }
366    pub fn n_leaves_upper_bound(&self) -> usize {
367        2usize.pow(self.max_depth as u32)
368    }
369    pub fn is_regularized(&self) -> bool {
370        self.learning_rate < 1.0
371    }
372}
373/// Kernel (Gram) matrix K_{ij} = k(x_i, x_j).
374pub struct KernelMatrix {
375    /// The matrix entries.
376    pub entries: Vec<Vec<f64>>,
377    /// Number of data points n.
378    pub n: usize,
379}
380impl KernelMatrix {
381    /// Compute the kernel matrix for a dataset and kernel function.
382    pub fn compute(kernel: &KernelFunction, data: &[Vec<f64>]) -> Self {
383        let n = data.len();
384        let entries: Vec<Vec<f64>> = (0..n)
385            .map(|i| {
386                (0..n)
387                    .map(|j| kernel.evaluate(&data[i], &data[j]))
388                    .collect()
389            })
390            .collect();
391        Self { entries, n }
392    }
393    /// Trace of the kernel matrix (sum of diagonal entries).
394    pub fn trace(&self) -> f64 {
395        (0..self.n).map(|i| self.entries[i][i]).sum()
396    }
397}
/// Kernel choices for [`SVMClassifier`]; evaluated by `kernel_value`.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq)]
pub enum SVMKernel {
    /// Linear kernel: k(x, y) = x·y.
    Linear,
    /// Polynomial kernel of the given degree d: k(x, y) = (x·y + 1)^d.
    Polynomial(u32),
    /// Gaussian RBF kernel with parameter γ: k(x, y) = exp(-γ‖x-y‖²).
    RBF(f64),
    /// Sigmoid kernel: k(x, y) = tanh(x·y).
    Sigmoid,
}
/// KL divergence D_KL(P‖Q) = Σ P(x) log(P(x)/Q(x)).
pub struct KLDivergence;
impl KLDivergence {
    /// Compute D_KL(p ‖ q) in nats.
    ///
    /// Terms with p(x) ≤ 0 are skipped (0·log 0 := 0); a term with p(x) > 0
    /// but q(x) = 0 makes the divergence +∞ (absolute continuity fails).
    pub fn compute(p: &[f64], q: &[f64]) -> f64 {
        let mut total = 0.0;
        for (&pi, &qi) in p.iter().zip(q.iter()) {
            if !(pi > 0.0) {
                continue;
            }
            total += if qi == 0.0 {
                f64::INFINITY
            } else {
                pi * (pi / qi).ln()
            };
        }
        total
    }
    /// Gibbs' inequality sanity check: D_KL ≥ 0 (up to numerical slack).
    pub fn is_nonneg(p: &[f64], q: &[f64]) -> bool {
        Self::compute(p, q) >= -1e-9
    }
}
/// VC dimension and growth function calculations.
pub struct VCDimension {
    /// The claimed VC dimension.
    pub dim: usize,
}
impl VCDimension {
    /// Create a new VCDimension.
    pub fn new(dim: usize) -> Self {
        Self { dim }
    }
    /// Sauer-Shelah bound: Π_H(m) ≤ Σ_{i=0}^{d} C(m,i).
    ///
    /// Binomials are built incrementally via C(m,i) = C(m,i-1)·(m-i+1)/i,
    /// which is exact in integer arithmetic. Intermediates are carried in
    /// u128 and the result saturates at `usize::MAX` instead of overflowing
    /// (the previous usize arithmetic could panic in debug builds or wrap
    /// silently in release builds for large m and d).
    pub fn sauer_shelah_bound(&self, m: usize) -> usize {
        let cap = usize::MAX as u128;
        let mut bound: u128 = 0;
        let mut binom: u128 = 1;
        for i in 0..=self.dim.min(m) {
            if i > 0 {
                // C(m,i) = C(m,i-1) * (m-i+1) / i; the division is exact.
                binom = match binom.checked_mul((m - i + 1) as u128) {
                    Some(prod) => prod / i as u128,
                    // Binomial exceeds u128: the sum certainly exceeds
                    // usize::MAX, so saturate.
                    None => return usize::MAX,
                };
            }
            bound = bound.saturating_add(binom);
            if bound >= cap {
                return usize::MAX;
            }
        }
        // bound < usize::MAX here, so the cast is lossless.
        bound as usize
    }
    /// Check if d-dimensional threshold classifier can shatter m points.
    ///
    /// For the canonical 1D threshold classifier H = {h_θ : θ ∈ ℝ}, VC dim = 1.
    /// This checks whether the bound is consistent with shattering (m ≤ d).
    pub fn can_shatter(&self, m: usize) -> bool {
        m <= self.dim
    }
    /// Fundamental theorem of PAC learning: finite VC dim ↔ PAC learnability.
    pub fn fundamental_theorem_pac(&self) -> bool {
        self.dim < usize::MAX
    }
}
/// Available kernel types.
///
/// NOTE(review): not consumed by any code visible in this module;
/// presumably evaluated by `KernelFunction` in `super::functions` — confirm.
pub enum KernelType {
    /// Linear kernel: k(x,y) = x·y.
    Linear,
    /// Polynomial kernel: k(x,y) = (x·y + c)^d, with degree d and offset c.
    Polynomial { degree: u32, offset: f64 },
    /// RBF/Gaussian kernel: k(x,y) = exp(-‖x-y‖²/(2σ²)).
    Rbf { sigma: f64 },
    /// Laplace kernel: k(x,y) = exp(-‖x-y‖/σ).
    Laplace { sigma: f64 },
}
/// Regret bound summary.
pub struct RegretBound {
    /// Number of rounds T.
    pub t: usize,
    /// Domain diameter D.
    pub d: f64,
    /// Gradient norm bound G.
    pub g: f64,
}
impl RegretBound {
    /// Collect the (T, D, G) quantities of an online learning run.
    pub fn new(t: usize, d: f64, g: f64) -> Self {
        RegretBound { t, d, g }
    }
    /// OGD regret bound D·G·√(2T).
    pub fn ogd_bound(&self) -> f64 {
        let horizon = 2.0 * self.t as f64;
        self.d * self.g * horizon.sqrt()
    }
}
/// Sample complexity for PAC learning (Blumer et al. / Vapnik-Chervonenkis bound).
///
/// Returns m = ceil((d * ln(d/eps + 1) + ln(2/delta)) / eps) samples.
/// Here `vc_dim` is the VC dimension d of the hypothesis class.
pub struct SampleComplexity {
    /// Accuracy parameter ε ∈ (0,1).
    pub eps: f64,
    /// Confidence parameter δ ∈ (0,1).
    pub delta: f64,
    /// VC dimension d of the hypothesis class.
    pub vc_dim: usize,
}
impl SampleComplexity {
    /// Bundle (ε, δ, d) for a sample-complexity query.
    pub fn new(eps: f64, delta: f64, vc_dim: usize) -> Self {
        SampleComplexity { eps, delta, vc_dim }
    }
    /// Number of samples sufficient for (ε, δ)-PAC learning:
    /// ceil((d·ln(d/ε + 1) + ln(2/δ)) / ε).
    pub fn compute(&self) -> usize {
        let d = self.vc_dim as f64;
        let vc_term = d * (d / self.eps + 1.0).ln();
        let confidence_term = (2.0 / self.delta).ln();
        ((vc_term + confidence_term) / self.eps).ceil() as usize
    }
}
/// Kernel SVM trainer using a simplified SMO algorithm.
///
/// Implements the Sequential Minimal Optimization (SMO) core update step.
pub struct KernelSVMTrainer {
    /// Number of training points.
    pub n: usize,
    /// Dual variables α_i ∈ [0, C].
    pub alphas: Vec<f64>,
    /// Labels y_i ∈ {-1, +1}.
    pub labels: Vec<f64>,
    /// Bias term b.
    pub bias: f64,
    /// Regularization parameter C (upper bound on α_i).
    pub c: f64,
}
impl KernelSVMTrainer {
    /// Create a new KernelSVMTrainer with zero alphas (so the initial
    /// decision function is identically the zero bias).
    pub fn new(n: usize, labels: Vec<f64>, c: f64) -> Self {
        Self {
            n,
            alphas: vec![0.0; n],
            labels,
            bias: 0.0,
            c,
        }
    }
    /// SVM decision function f(x) = Σ α_i y_i k(x_i, x) + b, where
    /// `kernel_row[i]` = k(x_i, x).
    pub fn decision(&self, kernel_row: &[f64]) -> f64 {
        self.alphas
            .iter()
            .zip(self.labels.iter())
            .zip(kernel_row.iter())
            .map(|((a, y), k)| a * y * k)
            .sum::<f64>()
            + self.bias
    }
    /// One SMO update step on a pair (i, j) given kernel matrix K.
    ///
    /// Returns true if a meaningful update was made (i ≠ j, positive
    /// curvature η, and a non-negligible change in α_j).
    ///
    /// Bug fix: for opposite labels the feasible interval for the new α_j is
    /// [max(0, α_j−α_i), min(C, C+α_j−α_i)] (Platt, 1998). The previous code
    /// used the difference with the wrong sign (α_i−α_j), clipping α_j to an
    /// invalid box and violating the dual constraint Σ α_i y_i = const.
    pub fn smo_step(&mut self, i: usize, j: usize, k: &[Vec<f64>]) -> bool {
        if i == j {
            return false;
        }
        let yi = self.labels[i];
        let yj = self.labels[j];
        // Prediction errors E = f(x) - y.
        let ei = self.decision(&k[i]) - yi;
        let ej = self.decision(&k[j]) - yj;
        // Curvature of the objective along the constraint line.
        let eta = k[i][i] + k[j][j] - 2.0 * k[i][j];
        if eta <= 0.0 {
            // Degenerate or negative curvature: skip this pair.
            return false;
        }
        let alpha_i_old = self.alphas[i];
        let alpha_j_old = self.alphas[j];
        // Box [L, H] for the new α_j on the constraint line.
        let (l, h) = if (yi - yj).abs() < 1e-9 {
            // Equal labels: α_i + α_j is conserved.
            let sum = alpha_i_old + alpha_j_old;
            ((sum - self.c).max(0.0), sum.min(self.c))
        } else {
            // Opposite labels: α_j - α_i is conserved.
            let diff = alpha_j_old - alpha_i_old;
            (diff.max(0.0), (self.c + diff).min(self.c))
        };
        let alpha_j_new = (alpha_j_old + yj * (ei - ej) / eta).clamp(l, h);
        if (alpha_j_new - alpha_j_old).abs() < 1e-5 {
            return false;
        }
        // Move α_i oppositely to keep Σ α y constant.
        let alpha_i_new = alpha_i_old + yi * yj * (alpha_j_old - alpha_j_new);
        // Recompute the bias from both updated points and average.
        let b1 = self.bias
            - ei
            - yi * (alpha_i_new - alpha_i_old) * k[i][i]
            - yj * (alpha_j_new - alpha_j_old) * k[i][j];
        let b2 = self.bias
            - ej
            - yi * (alpha_i_new - alpha_i_old) * k[i][j]
            - yj * (alpha_j_new - alpha_j_old) * k[j][j];
        self.bias = (b1 + b2) / 2.0;
        self.alphas[i] = alpha_i_new;
        self.alphas[j] = alpha_j_new;
        true
    }
    /// Margin-based generalization bound R²/(γ²·n) for data radius R,
    /// margin γ, and n training samples.
    pub fn generalization_bound(radius: f64, margin: f64, n: usize) -> f64 {
        (radius * radius) / (margin * margin * n as f64)
    }
}
602/// PAC learner: wraps accuracy/confidence parameters.
603pub struct PACLearner {
604    /// ε: maximum tolerated generalization error.
605    pub eps: f64,
606    /// δ: failure probability (confidence 1−δ).
607    pub delta: f64,
608}
609impl PACLearner {
610    /// Create a new PAC learner.
611    pub fn new(eps: f64, delta: f64) -> Self {
612        Self { eps, delta }
613    }
614    /// Required sample size for a hypothesis class of VC dimension d.
615    pub fn required_samples(&self, vc_dim: usize) -> usize {
616        SampleComplexity::new(self.eps, self.delta, vc_dim).compute()
617    }
618}
/// Evidence Lower Bound (ELBO) for variational inference.
///
/// ℒ(q) = E_q[log p(x,z)] - E_q[log q(z)] = log p(x) - D_KL(q(z) ‖ p(z|x))
pub struct ELBO {
    /// D_KL(q‖p) component.
    pub kl_term: f64,
    /// E_q[log p(x,z)] reconstruction term.
    pub reconstruction_term: f64,
}
impl ELBO {
    /// Assemble an ELBO from its reconstruction and KL components.
    pub fn new(reconstruction_term: f64, kl_term: f64) -> Self {
        ELBO {
            reconstruction_term,
            kl_term,
        }
    }
    /// ℒ(q) = reconstruction − KL.
    pub fn value(&self) -> f64 {
        self.reconstruction_term - self.kl_term
    }
    /// ELBO of discrete distributions: ℒ = Σ_z q(z) log(p(x,z)/q(z)).
    ///
    /// Terms with q(z) ≤ 0 contribute nothing; q(z) > 0 with p(x,z) = 0
    /// drives the bound to −∞.
    pub fn compute(q: &[f64], p_joint: &[f64]) -> f64 {
        let mut lower_bound = 0.0;
        for (&qi, &pi) in q.iter().zip(p_joint.iter()) {
            if !(qi > 0.0) {
                continue;
            }
            lower_bound += if pi == 0.0 {
                f64::NEG_INFINITY
            } else {
                qi * (pi / qi).ln()
            };
        }
        lower_bound
    }
}
657/// Tikhonov (ridge) regularization: min_h L(h) + λ‖h‖².
658pub struct TikhonovReg {
659    /// Regularization parameter λ.
660    pub lambda: f64,
661}
662impl TikhonovReg {
663    /// Create a new TikhonovReg.
664    pub fn new(lambda: f64) -> Self {
665        Self { lambda }
666    }
667    /// Ridge regression closed-form solution: w = (X^T X + λI)^{-1} X^T y.
668    /// Here X is n×d (row major), y is length-n.  Returns weight vector w.
669    pub fn solve(&self, x: &[Vec<f64>], y: &[f64]) -> Vec<f64> {
670        let d = if x.is_empty() { 0 } else { x[0].len() };
671        let n = x.len();
672        let mut xtx = vec![vec![0.0f64; d]; d];
673        for i in 0..d {
674            xtx[i][i] = self.lambda;
675        }
676        for row in x {
677            for i in 0..d {
678                for j in 0..d {
679                    xtx[i][j] += row[i] * row[j];
680                }
681            }
682        }
683        let mut xty = vec![0.0f64; d];
684        for (row, &yi) in x.iter().zip(y.iter()) {
685            for j in 0..d {
686                xty[j] += row[j] * yi;
687            }
688        }
689        gauss_solve(&xtx, &xty, d, n)
690    }
691    /// Regularization penalty: λ‖w‖².
692    pub fn penalty(&self, w: &[f64]) -> f64 {
693        self.lambda * w.iter().map(|wi| wi * wi).sum::<f64>()
694    }
695}
/// Backdoor adjustment formula for causal inference.
///
/// Computes P(Y | do(X=x)) = Σ_z P(Y | X=x, Z=z) * P(Z=z)
/// given a set of confounder strata z.
pub struct CausalBackdoor {
    /// Conditional probabilities P(Y=1 | X=x, Z=z) for each stratum z.
    pub cond_probs: Vec<f64>,
    /// Marginal probabilities P(Z=z) for each stratum z.
    pub stratum_probs: Vec<f64>,
}
impl CausalBackdoor {
    /// Pair the per-stratum conditionals with the stratum marginals.
    pub fn new(cond_probs: Vec<f64>, stratum_probs: Vec<f64>) -> Self {
        CausalBackdoor {
            cond_probs,
            stratum_probs,
        }
    }
    /// Interventional probability P(Y=1 | do(X=x)): the stratum-weighted
    /// average Σ_z P(Y=1 | X=x, Z=z)·P(Z=z).
    pub fn adjust(&self) -> f64 {
        let mut acc = 0.0;
        for (p, q) in self.cond_probs.iter().zip(self.stratum_probs.iter()) {
            acc += p * q;
        }
        acc
    }
    /// Absolute gap between the observational and interventional quantities.
    pub fn confounding_bias(&self, observational: f64) -> f64 {
        (observational - self.adjust()).abs()
    }
    /// Sanity check that the stratum marginals form a distribution (sum ≈ 1).
    pub fn is_valid(&self) -> bool {
        let total: f64 = self.stratum_probs.iter().sum();
        (total - 1.0).abs() < 1e-6
    }
}
734/// Double Rademacher (two-sided) bound.
735pub struct DoubleRademacher {
736    /// Rademacher complexity instance.
737    pub rademacher: RademacherComplexity,
738}
739impl DoubleRademacher {
740    /// Create a new DoubleRademacher instance.
741    pub fn new(n: usize, class_size: usize) -> Self {
742        Self {
743            rademacher: RademacherComplexity::new(n, class_size),
744        }
745    }
746    /// Two-sided bound: |L_D(h) - L_S(h)| ≤ 2 R_n(H) w.h.p.
747    pub fn two_sided_bound(&self) -> f64 {
748        2.0 * self.rademacher.upper_bound()
749    }
750}
751/// Online Perceptron classifier.
752pub struct Perceptron {
753    /// Weight vector w ∈ ℝ^d.
754    pub weights: Vec<f64>,
755    /// Bias term b.
756    pub bias: f64,
757    /// Number of mistakes made so far.
758    pub mistakes: usize,
759}
760impl Perceptron {
761    /// Create a new zero-initialized Perceptron.
762    pub fn new(dim: usize) -> Self {
763        Self {
764            weights: vec![0.0; dim],
765            bias: 0.0,
766            mistakes: 0,
767        }
768    }
769    /// Predict the label for input x: sign(w·x + b).
770    pub fn predict(&self, x: &[f64]) -> f64 {
771        let score = dot(&self.weights, x) + self.bias;
772        if score >= 0.0 {
773            1.0
774        } else {
775            -1.0
776        }
777    }
778    /// Online update: if prediction wrong, w ← w + y·x, b ← b + y.
779    pub fn update(&mut self, x: &[f64], label: f64) -> bool {
780        let pred = self.predict(x);
781        if (pred * label) <= 0.0 {
782            for (wi, &xi) in self.weights.iter_mut().zip(x.iter()) {
783                *wi += label * xi;
784            }
785            self.bias += label;
786            self.mistakes += 1;
787            true
788        } else {
789            false
790        }
791    }
792    /// Perceptron mistake bound: M ≤ (R/γ)² where R = radius, γ = margin.
793    pub fn mistake_bound(radius: f64, margin: f64) -> usize {
794        ((radius / margin).powi(2)).ceil() as usize
795    }
796}
/// Gaussian complexity (Gaussian analog of Rademacher).
pub struct GaussianComplexity {
    /// Number of samples n.
    pub n: usize,
    /// Number of hypotheses in the class.
    pub class_size: usize,
}
impl GaussianComplexity {
    /// Bound instance for n samples and |H| hypotheses.
    pub fn new(n: usize, class_size: usize) -> Self {
        GaussianComplexity { n, class_size }
    }
    /// Finite-class bound √(2 ln|H| / n), matching the Rademacher case by
    /// the standard comparison between the two complexities.
    pub fn upper_bound(&self) -> f64 {
        let ln_h = (self.class_size as f64).ln();
        (2.0 * ln_h / self.n as f64).sqrt()
    }
}
/// Loss functions selectable for [`GradientBoosting`].
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq)]
pub enum GBLoss {
    /// Mean squared error (L2) loss.
    MSE,
    /// Mean absolute error (L1) loss.
    MAE,
    /// Logistic loss for classification.
    LogLoss,
    /// Huber loss with the given transition threshold δ.
    Huber(f64),
}
/// Lasso (ℓ₁) regularization: min_h L(h) + λ‖h‖₁.
pub struct LassoReg {
    /// Regularization parameter λ.
    pub lambda: f64,
}
impl LassoReg {
    /// Create an ℓ₁ regularizer of strength λ.
    pub fn new(lambda: f64) -> Self {
        LassoReg { lambda }
    }
    /// Coordinate-wise soft-thresholding: sign(w)·max(|w| - λ, 0).
    ///
    /// Coordinates within λ of zero are zeroed out — this is what makes
    /// lasso solutions sparse.
    pub fn soft_threshold(&self, w: &[f64]) -> Vec<f64> {
        let mut shrunk = Vec::with_capacity(w.len());
        for &wi in w {
            let magnitude = wi.abs();
            shrunk.push(if magnitude <= self.lambda {
                0.0
            } else {
                wi.signum() * (magnitude - self.lambda)
            });
        }
        shrunk
    }
    /// Penalty value λ‖w‖₁.
    pub fn penalty(&self, w: &[f64]) -> f64 {
        let l1_norm: f64 = w.iter().map(|wi| wi.abs()).sum();
        self.lambda * l1_norm
    }
}
/// UCB1 (Upper Confidence Bound) algorithm for multi-armed bandits.
///
/// Achieves cumulative regret O(√(n T ln T)) where n = number of arms.
pub struct UCBBandit {
    /// Number of arms.
    pub n: usize,
    /// Number of times each arm has been pulled.
    pub counts: Vec<usize>,
    /// Empirical mean reward for each arm.
    pub means: Vec<f64>,
    /// Total rounds elapsed.
    pub t: usize,
}
impl UCBBandit {
    /// Fresh bandit with n arms and no pulls yet.
    pub fn new(n: usize) -> Self {
        UCBBandit {
            n,
            counts: vec![0; n],
            means: vec![0.0; n],
            t: 0,
        }
    }
    /// Pick the arm maximizing the UCB index μ_i + √(2 ln t / n_i).
    ///
    /// Any never-pulled arm (count 0) takes priority — its index is
    /// effectively infinite.
    pub fn select(&self) -> usize {
        for (arm, &pulls) in self.counts.iter().enumerate() {
            if pulls == 0 {
                return arm;
            }
        }
        let ln_t = (self.t as f64).ln();
        let ucb = |arm: usize| -> f64 {
            self.means[arm] + (2.0 * ln_t / self.counts[arm] as f64).sqrt()
        };
        (0..self.n)
            .max_by(|&a, &b| {
                ucb(a)
                    .partial_cmp(&ucb(b))
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .unwrap_or(0)
    }
    /// Incorporate a reward observation for `arm` (incremental mean update).
    pub fn update(&mut self, arm: usize, reward: f64) {
        self.counts[arm] += 1;
        let pulls = self.counts[arm] as f64;
        self.means[arm] += (reward - self.means[arm]) / pulls;
        self.t += 1;
    }
    /// Order-of-magnitude regret bound √(n·T·ln T).
    pub fn regret_bound_upper(&self) -> f64 {
        let (t, n) = (self.t as f64, self.n as f64);
        (n * t * t.ln()).sqrt()
    }
}
/// PAC-Bayes generalization bound computation.
///
/// McAllester's bound: L_D(Q) ≤ L_S(Q) + √((KL(Q‖P) + ln(n/δ)) / (2n)).
pub struct PACBayesGeneralization {
    /// KL divergence KL(Q‖P) in nats.
    pub kl_qp: f64,
    /// Number of training samples n.
    pub n: usize,
    /// Confidence parameter δ.
    pub delta: f64,
}
impl PACBayesGeneralization {
    /// Bundle the quantities entering the PAC-Bayes bounds.
    pub fn new(kl_qp: f64, n: usize, delta: f64) -> Self {
        PACBayesGeneralization { kl_qp, n, delta }
    }
    /// McAllester bound: empirical loss + √((KL + ln(n/δ)) / (2n)).
    pub fn mcallester_bound(&self, empirical_loss: f64) -> f64 {
        let samples = self.n as f64;
        let numerator = self.kl_qp + (samples / self.delta).ln();
        empirical_loss + (numerator / (2.0 * samples)).sqrt()
    }
    /// Catoni-style bound (λ-parameterized approximation):
    /// L_D(Q) ≤ (1/(1-λ/2)) · (L_S(Q) + KL/(2λn)).
    pub fn catoni_bound(&self, empirical_loss: f64, lambda: f64) -> f64 {
        let scale = 1.0 / (1.0 - lambda / 2.0);
        let penalty = self.kl_qp / (2.0 * lambda * self.n as f64);
        scale * (empirical_loss + penalty)
    }
    /// λ minimizing the Catoni bound: 1/√(1 + n·L_S/KL)
    /// (KL floored at 1e-9 to avoid dividing by zero).
    pub fn optimal_lambda(&self, empirical_loss: f64) -> f64 {
        let ratio = self.n as f64 * empirical_loss / self.kl_qp.max(1e-9);
        1.0 / (1.0 + ratio).sqrt()
    }
}
/// Exponential Weights Algorithm (Hedge / EWA) for online learning.
///
/// Maintains a distribution over n experts and uses multiplicative updates.
/// Guarantees regret R_T ≤ ln(n)/η + η T/2.
#[derive(Debug, Clone)]
pub struct ExponentialWeightsAlgorithm {
    /// Number of experts.
    pub n: usize,
    /// Learning rate η.
    pub eta: f64,
    /// Current weights (unnormalized).
    pub weights: Vec<f64>,
    /// Total rounds elapsed.
    pub rounds: usize,
}
impl ExponentialWeightsAlgorithm {
    /// Create a new EWA with uniform initial weights.
    pub fn new(n: usize, eta: f64) -> Self {
        Self {
            n,
            eta,
            weights: vec![1.0; n],
            rounds: 0,
        }
    }
    /// Return the current probability distribution over experts.
    ///
    /// NOTE: if every weight has underflowed to 0.0 (possible after many
    /// large-loss updates, since the weights are never renormalized), the
    /// result is all-NaN; callers should keep η·loss bounded.
    pub fn distribution(&self) -> Vec<f64> {
        let total: f64 = self.weights.iter().sum();
        self.weights.iter().map(|w| w / total).collect()
    }
    /// Multiplicative update: w_i ← w_i * exp(-η * loss_i).
    ///
    /// `losses` entries beyond `self.n` are ignored, and weights past the
    /// end of `losses` are left unchanged (zip truncates to the shorter).
    pub fn update(&mut self, losses: &[f64]) {
        for (w, &l) in self.weights.iter_mut().zip(losses.iter()) {
            *w *= (-self.eta * l).exp();
        }
        self.rounds += 1;
    }
    /// EWA regret bound: R_T ≤ ln(n)/η + η * T / 2.
    pub fn regret_bound(&self) -> f64 {
        let t = self.rounds as f64;
        (self.n as f64).ln() / self.eta + self.eta * t / 2.0
    }
    /// Optimal learning rate for T rounds: η* = √(2 ln n / T).
    ///
    /// Returns a non-finite value (∞, or NaN when n = 1) if `t == 0`.
    pub fn optimal_eta(n: usize, t: usize) -> f64 {
        (2.0 * (n as f64).ln() / t as f64).sqrt()
    }
}
/// Feature map: maps inputs to a (truncated) explicit feature space.
#[derive(Debug, Clone)]
pub struct FeatureMap {
    /// Dimensionality of the feature space.
    pub feature_dim: usize,
}
992impl FeatureMap {
993    /// Create a new feature map.
994    pub fn new(feature_dim: usize) -> Self {
995        Self { feature_dim }
996    }
997    /// Compute the inner product ⟨φ(x), φ(y)⟩ in feature space.
998    /// For the linear kernel, φ(x) = x, so this is just the dot product.
999    pub fn inner_product(&self, x: &[f64], y: &[f64]) -> f64 {
1000        dot(x, y)
1001    }
1002}
/// Kernel SVM dual representation.
#[derive(Debug, Clone)]
pub struct KernelSVM {
    /// Dual variables α_i.
    pub alphas: Vec<f64>,
    /// Labels y_i ∈ {-1, +1}.
    pub labels: Vec<f64>,
    /// Bias term b.
    pub bias: f64,
    /// Regularization parameter C.
    pub c: f64,
}
impl KernelSVM {
    /// Create a new kernel SVM (initialized to zero weights).
    ///
    /// All dual variables start at 0 and all labels default to +1 as a
    /// placeholder; a training procedure is expected to overwrite both.
    pub fn new(n: usize, c: f64) -> Self {
        Self {
            alphas: vec![0.0; n],
            labels: vec![1.0; n],
            bias: 0.0,
            c,
        }
    }
    /// Decision function: f(x) = Σ α_i y_i k(x_i, x) + b.
    ///
    /// `kernel_vals[i]` must hold k(x_i, x) for training point i; the sum
    /// runs over the shortest of the three slices (zip truncates).
    pub fn predict(&self, kernel_vals: &[f64]) -> f64 {
        let sum: f64 = self
            .alphas
            .iter()
            .zip(self.labels.iter())
            .zip(kernel_vals.iter())
            .map(|((a, y), k)| a * y * k)
            .sum();
        sum + self.bias
    }
}
/// Gaussian process regression model.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct GaussianProcess {
    /// Prior mean of the process.
    pub mean: f64,
    /// RBF length scale l.
    pub length_scale: f64,
    /// Signal variance σ².
    pub signal_variance: f64,
    /// Observation noise variance σ_n².
    pub noise_variance: f64,
}
#[allow(dead_code)]
impl GaussianProcess {
    /// Construct a GP from its four hyperparameters.
    pub fn new(mean: f64, length_scale: f64, signal_var: f64, noise_var: f64) -> Self {
        GaussianProcess {
            mean,
            length_scale,
            signal_variance: signal_var,
            noise_variance: noise_var,
        }
    }
    /// Zero-mean GP with a unit RBF kernel and small (0.01) noise.
    pub fn default_rbf() -> Self {
        GaussianProcess::new(0.0, 1.0, 1.0, 0.01)
    }
    /// RBF (squared exponential) kernel: k(x, x') = σ^2 exp(-|x-x'|^2 / (2l^2)).
    pub fn rbf_kernel(&self, x: f64, xp: f64) -> f64 {
        let diff = x - xp;
        let exponent = -diff * diff / (2.0 * self.length_scale.powi(2));
        self.signal_variance * exponent.exp()
    }
    /// Predictive variance at a new point (simplified: just signal variance).
    ///
    /// Subtracts a crude rank-one style correction built from the squared
    /// cross-covariances, floored at the noise variance.
    pub fn predictive_variance(&self, x: f64, train_x: &[f64]) -> f64 {
        let prior_var = self.rbf_kernel(x, x);
        // Accumulate the squared covariance between x and each training point.
        let mut sq_sum = 0.0;
        for &xi in train_x {
            let cov = self.rbf_kernel(x, xi);
            sq_sum += cov * cov;
        }
        let denom = (self.signal_variance + self.noise_variance).max(1e-10);
        let reduction = sq_sum / denom;
        (prior_var - reduction).max(self.noise_variance)
    }
    /// Crude log marginal likelihood: only the Gaussian normalization term.
    pub fn log_marginal_likelihood_approx(&self, n: usize) -> f64 {
        let two_pi_var = 2.0 * std::f64::consts::PI * self.signal_variance;
        -(n as f64) / 2.0 * two_pi_var.ln()
    }
}
/// A kernel function k: ℝ^d × ℝ^d → ℝ.
///
/// The concrete form (linear / polynomial / RBF / Laplace) is selected by
/// `kernel_type`; see `KernelFunction::evaluate` for the formulas.
pub struct KernelFunction {
    /// Kernel type identifier.
    pub kernel_type: KernelType,
}
impl KernelFunction {
    /// Create a new kernel function.
    pub fn new(kernel_type: KernelType) -> Self {
        Self { kernel_type }
    }
    /// Evaluate the kernel k(x, y) where x, y are vectors in ℝ^d.
    ///
    /// Formulas by variant:
    /// - Linear:     k(x, y) = ⟨x, y⟩
    /// - Polynomial: k(x, y) = (⟨x, y⟩ + offset)^degree
    /// - RBF:        k(x, y) = exp(-‖x−y‖² / (2σ²))
    /// - Laplace:    k(x, y) = exp(-‖x−y‖₁ / σ)
    ///
    /// The pairwise sums zip x with y, so extra trailing components of the
    /// longer slice are ignored.
    pub fn evaluate(&self, x: &[f64], y: &[f64]) -> f64 {
        match &self.kernel_type {
            KernelType::Linear => dot(x, y),
            // NOTE(review): `degree as i32` assumes degree fits in i32 — TODO
            // confirm the range KernelType allows.
            KernelType::Polynomial { degree, offset } => (dot(x, y) + offset).powi(*degree as i32),
            KernelType::Rbf { sigma } => {
                // Squared Euclidean distance ‖x − y‖².
                let diff_sq: f64 = x.iter().zip(y).map(|(a, b)| (a - b).powi(2)).sum();
                (-diff_sq / (2.0 * sigma * sigma)).exp()
            }
            KernelType::Laplace { sigma } => {
                // L1 (Manhattan) distance ‖x − y‖₁.
                let diff_norm: f64 = x.iter().zip(y).map(|(a, b)| (a - b).abs()).sum();
                (-diff_norm / sigma).exp()
            }
        }
    }
    /// Check if the kernel matrix for a set of points is positive semi-definite
    /// (via Cholesky: all pivots ≥ -ε for numerical tolerance).
    ///
    /// Builds the full n×n Gram matrix and runs an in-place
    /// Cholesky–Banachiewicz factorization over its lower triangle. A pivot
    /// below −1e-9 indicates a numerically negative eigenvalue → not PSD.
    pub fn is_positive_definite(&self, points: &[Vec<f64>]) -> bool {
        let n = points.len();
        // Gram matrix: k[i][j] = k(points[i], points[j]).
        let mut k: Vec<Vec<f64>> = (0..n)
            .map(|i| {
                (0..n)
                    .map(|j| self.evaluate(&points[i], &points[j]))
                    .collect()
            })
            .collect();
        for i in 0..n {
            // Off-diagonal entries of row i of the Cholesky factor L,
            // overwriting the lower triangle of k in place.
            for j in 0..i {
                let mut sum = k[i][j];
                for l in 0..j {
                    sum -= k[i][l] * k[j][l];
                }
                // NOTE(review): divides by the pivot k[j][j]; a zero pivot
                // (singular but PSD Gram matrix, e.g. duplicate points) makes
                // this inf/NaN. The NaN then fails the `< -1e-9` test below,
                // so the function still returns true, but the factor itself
                // is garbage — acceptable for a boolean check, worth knowing.
                k[i][j] = sum / k[j][j];
            }
            // Squared pivot: diagonal entry minus the row's squared L entries.
            let mut diag = k[i][i];
            for l in 0..i {
                diag -= k[i][l] * k[i][l];
            }
            // A pivot more negative than the tolerance ⇒ not PSD.
            if diag < -1e-9 {
                return false;
            }
            // Clamp tiny negative round-off to zero before the square root.
            k[i][i] = diag.max(0.0).sqrt();
        }
        true
    }
}