aprender-core 0.29.1

Next-generation machine learning library in pure Rust
//! Regularization techniques for neural network training.
//!
//! # Techniques
//! - Mixup: Interpolate between training samples
//! - Label Smoothing: Soft targets instead of hard labels
//! - `CutMix`: Cut and paste patches between images
//! - Stochastic Depth: Randomly drop residual blocks during training
//! - R-Drop: Consistency loss between two dropout forward passes
//! - `SpecAugment`: Frequency and time masking for spectrograms

use crate::primitives::Vector;
use rand::Rng;

/// Mixup data augmentation (Zhang et al., 2018).
/// Creates virtual training examples: x' = `λ·x_i + (1 - λ)·x_j`
#[derive(Debug, Clone)]
pub struct Mixup {
    alpha: f32,
}

impl Mixup {
    /// Create a new Mixup whose mixing coefficient λ is drawn from
    /// Beta(`alpha`, `alpha`). An `alpha` of zero or below disables mixing.
    #[must_use]
    pub fn new(alpha: f32) -> Self {
        Self { alpha }
    }

    /// Sample mixing coefficient from Beta(alpha, alpha).
    #[must_use]
    pub fn sample_lambda(&self) -> f32 {
        if self.alpha <= 0.0 {
            return 1.0;
        }
        sample_beta(self.alpha, self.alpha)
    }

    /// Mix two samples: x' = λ*x1 + (1-λ)*x2
    #[must_use]
    pub fn mix_samples(&self, x1: &Vector<f32>, x2: &Vector<f32>, lambda: f32) -> Vector<f32> {
        let mixed: Vec<f32> = x1
            .as_slice()
            .iter()
            .zip(x2.as_slice().iter())
            .map(|(&a, &b)| lambda * a + (1.0 - lambda) * b)
            .collect();
        Vector::from_slice(&mixed)
    }

    /// Mix labels: y' = λ*y1 + (1-λ)*y2
    #[must_use]
    pub fn mix_labels(&self, y1: &Vector<f32>, y2: &Vector<f32>, lambda: f32) -> Vector<f32> {
        self.mix_samples(y1, y2, lambda)
    }

    #[must_use]
    pub fn alpha(&self) -> f32 {
        self.alpha
    }
}
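
// Illustrative usage sketch for `Mixup` (the alpha value and inputs below are
// example assumptions, not crate defaults): mix two one-hot samples with a
// fixed λ and check the convex combination.
#[cfg(test)]
mod mixup_usage_example {
    use super::*;

    #[test]
    fn mixes_samples_and_labels() {
        let mixup = Mixup::new(0.4);
        let x1 = Vector::from_slice(&[1.0, 0.0, 0.0]);
        let x2 = Vector::from_slice(&[0.0, 1.0, 0.0]);
        // With λ = 0.7 the mix is 0.7 * x1 + 0.3 * x2.
        let mixed = mixup.mix_samples(&x1, &x2, 0.7);
        assert!((mixed.as_slice()[0] - 0.7).abs() < 1e-6);
        assert!((mixed.as_slice()[1] - 0.3).abs() < 1e-6);
        // Labels mix with the same coefficient.
        let y = mixup.mix_labels(&x1, &x2, 0.7);
        assert_eq!(y.as_slice(), mixed.as_slice());
        // sample_lambda draws from Beta(α, α) and stays in [0, 1].
        assert!((0.0..=1.0).contains(&mixup.sample_lambda()));
    }
}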

/// Label smoothing for soft targets.
/// Converts hard labels to: (1-ε)y + ε/K
#[derive(Debug, Clone)]
pub struct LabelSmoothing {
    epsilon: f32,
}

impl LabelSmoothing {
    /// Create label smoothing with smoothing factor ε.
    #[must_use]
    pub fn new(epsilon: f32) -> Self {
        assert!((0.0..1.0).contains(&epsilon), "epsilon must be in [0, 1)");
        Self { epsilon }
    }

    /// Smooth a one-hot label vector.
    #[must_use]
    pub fn smooth(&self, label: &Vector<f32>) -> Vector<f32> {
        let n_classes = label.len();
        let smoothed: Vec<f32> = label
            .as_slice()
            .iter()
            .map(|&y| (1.0 - self.epsilon) * y + self.epsilon / n_classes as f32)
            .collect();
        Vector::from_slice(&smoothed)
    }

    /// Create smoothed one-hot from class index.
    #[must_use]
    pub fn smooth_index(&self, class_idx: usize, n_classes: usize) -> Vector<f32> {
        let mut result = vec![self.epsilon / n_classes as f32; n_classes];
        result[class_idx] = 1.0 - self.epsilon + self.epsilon / n_classes as f32;
        Vector::from_slice(&result)
    }

    #[must_use]
    pub fn epsilon(&self) -> f32 {
        self.epsilon
    }
}
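
// Illustrative sketch of label smoothing with ε = 0.1 over 4 classes (example
// values, not crate defaults): the true class gets 1 - ε + ε/K = 0.925 and
// every other class gets ε/K = 0.025, so the target still sums to 1.
#[cfg(test)]
mod label_smoothing_usage_example {
    use super::*;

    #[test]
    fn smooths_a_one_hot_target() {
        let smoothing = LabelSmoothing::new(0.1);
        let target = smoothing.smooth_index(2, 4);
        assert!((target.as_slice()[2] - 0.925).abs() < 1e-6);
        assert!((target.as_slice()[0] - 0.025).abs() < 1e-6);
        let sum: f32 = target.as_slice().iter().sum();
        assert!((sum - 1.0).abs() < 1e-6);
    }
}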

/// Cross-entropy loss with label smoothing.
///
/// ONE PATH: Uses `nn::functional::log_softmax_1d` for numerical stability (UCBD §4).
#[must_use]
pub fn cross_entropy_with_smoothing(logits: &Vector<f32>, target_idx: usize, epsilon: f32) -> f32 {
    let n_classes = logits.len();
    let log_probs = crate::nn::functional::log_softmax_1d(logits.as_slice());

    let mut loss = 0.0;
    for (i, &lp) in log_probs.iter().enumerate() {
        let target = if i == target_idx {
            1.0 - epsilon + epsilon / n_classes as f32
        } else {
            epsilon / n_classes as f32
        };
        loss -= target * lp;
    }
    loss
}
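
// Worked example (illustrative values): with uniform logits every class has
// probability 1/K, so the smoothed cross-entropy reduces to ln(K) regardless
// of ε, because the smoothed targets still sum to 1.
#[cfg(test)]
mod smoothed_cross_entropy_example {
    use super::*;

    #[test]
    fn uniform_logits_give_ln_k() {
        let logits = Vector::from_slice(&[0.0, 0.0, 0.0, 0.0]);
        let loss = cross_entropy_with_smoothing(&logits, 1, 0.1);
        assert!((loss - 4.0_f32.ln()).abs() < 1e-5);
    }
}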

/// Sample from Beta(alpha, beta) as X / (X + Y), with X ~ Gamma(alpha) and Y ~ Gamma(beta).
fn sample_beta(alpha: f32, beta: f32) -> f32 {
    let mut rng = rand::rng();
    let x = sample_gamma(alpha, &mut rng);
    let y = sample_gamma(beta, &mut rng);
    let sum = x + y;
    // With extreme shape parameters (e.g. 0.01), f32 gamma samples can
    // underflow to 0.0, producing 0/0 = NaN. Return 0.5 in that case
    // (unbiased midpoint, correct for the symmetric alpha==beta case).
    if sum <= 0.0 {
        return 0.5;
    }
    (x / sum).clamp(0.0, 1.0)
}

fn sample_gamma(shape: f32, rng: &mut impl Rng) -> f32 {
    // Marsaglia and Tsang (2000) squeeze method for Gamma(shape, 1).
    // For shape < 1, boost via Gamma(shape) = Gamma(shape + 1) * U^(1/shape).
    if shape < 1.0 {
        return sample_gamma(1.0 + shape, rng) * rng.random::<f32>().powf(1.0 / shape);
    }
    let d = shape - 1.0 / 3.0;
    let c = 1.0 / (9.0 * d).sqrt();
    loop {
        let x: f32 = sample_normal(rng);
        let v = (1.0 + c * x).powi(3);
        if v > 0.0 {
            let u: f32 = rng.random();
            if u < 1.0 - 0.0331 * x.powi(4) || u.ln() < 0.5 * x * x + d * (1.0 - v + v.ln()) {
                return d * v;
            }
        }
    }
}

fn sample_normal(rng: &mut impl Rng) -> f32 {
    // Box-Muller transform; clamp u1 away from 0 so ln(u1) stays finite.
    let u1: f32 = rng.random::<f32>().max(1e-10);
    let u2: f32 = rng.random();
    (-2.0 * u1.ln()).sqrt() * (2.0 * std::f32::consts::PI * u2).cos()
}
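
// Sanity-check sketch for the private samplers above (shape values are
// arbitrary examples): a Beta draw must land in [0, 1] and a Gamma draw with
// shape >= 1 must be strictly positive.
#[cfg(test)]
mod sampler_property_example {
    use super::*;

    #[test]
    fn beta_and_gamma_samples_are_in_range() {
        for _ in 0..100 {
            let b = sample_beta(0.4, 0.4);
            assert!((0.0..=1.0).contains(&b));
        }
        let mut rng = rand::rng();
        for _ in 0..100 {
            assert!(sample_gamma(2.0, &mut rng) > 0.0);
        }
    }
}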

/// `CutMix` data augmentation (Yun et al., 2019).
///
/// Cuts a rectangular region from one image and pastes onto another.
#[derive(Debug, Clone)]
pub struct CutMix {
    alpha: f32,
}

impl CutMix {
    #[must_use]
    pub fn new(alpha: f32) -> Self {
        Self { alpha }
    }

    /// Sample lambda and bounding box for cutmix.
    #[must_use]
    pub fn sample(&self, height: usize, width: usize) -> CutMixParams {
        // Alpha <= 0 means no mixing: lambda = 1.0, empty box
        if self.alpha <= 0.0 {
            return CutMixParams {
                lambda: 1.0,
                x1: 0,
                y1: 0,
                x2: 0,
                y2: 0,
            };
        }
        let lambda = sample_beta(self.alpha, self.alpha);

        // Sample a bounding box whose sides scale with sqrt(1 - λ),
        // so the cut area covers roughly (1 - λ) of the image.
        let ratio = (1.0 - lambda).sqrt();
        let rh = (height as f32 * ratio) as usize;
        let rw = (width as f32 * ratio) as usize;

        let mut rng = rand::rng();
        let cx = rng.random_range(0..width);
        let cy = rng.random_range(0..height);

        let x1 = cx.saturating_sub(rw / 2);
        let y1 = cy.saturating_sub(rh / 2);
        let x2 = (cx + rw / 2).min(width);
        let y2 = (cy + rh / 2).min(height);

        // Recompute λ from the clipped box area; clipping at the image border
        // shrinks the box, so the effective mixing ratio can differ from the sample.
        let actual_lambda = 1.0 - ((x2 - x1) * (y2 - y1)) as f32 / (height * width) as f32;

        CutMixParams {
            lambda: actual_lambda,
            x1,
            y1,
            x2,
            y2,
        }
    }

    #[must_use]
    pub fn alpha(&self) -> f32 {
        self.alpha
    }
}

/// Parameters for a `CutMix` operation.
#[derive(Debug, Clone)]
pub struct CutMixParams {
    pub lambda: f32,
    pub x1: usize,
    pub y1: usize,
    pub x2: usize,
    pub y2: usize,
}

impl CutMixParams {
    /// Apply `CutMix` to two flattened images stored channel-major as `[C * H * W]` slices.
    #[must_use]
    pub fn apply(
        &self,
        img1: &[f32],
        img2: &[f32],
        channels: usize,
        height: usize,
        width: usize,
    ) -> Vec<f32> {
        let mut result = img1.to_vec();

        for c in 0..channels {
            for y in self.y1..self.y2 {
                for x in self.x1..self.x2 {
                    let idx = c * height * width + y * width + x;
                    if idx < result.len() {
                        result[idx] = img2[idx];
                    }
                }
            }
        }
        result
    }
}
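
// Illustrative end-to-end sketch for CutMix on a tiny 1x8x8 "image" (sizes and
// α are example values): sample a box, paste img2 into img1, and check that the
// effective λ matches the fraction of pixels kept from img1.
#[cfg(test)]
mod cutmix_usage_example {
    use super::*;

    #[test]
    fn pasted_area_matches_lambda() {
        let (c, h, w) = (1, 8, 8);
        let img1 = vec![1.0_f32; c * h * w];
        let img2 = vec![0.0_f32; c * h * w];
        let params = CutMix::new(1.0).sample(h, w);
        let mixed = params.apply(&img1, &img2, c, h, w);
        // λ reported by `sample` is 1 - box_area / (H * W); the pasted zeros
        // therefore make up the complementary fraction of pixels.
        let kept = mixed.iter().filter(|&&v| v > 0.5).count();
        let expected = (params.lambda * (h * w) as f32).round() as usize;
        assert_eq!(kept, expected);
    }
}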

/// Stochastic Depth (Huang et al., 2016).
///
/// Randomly drops entire residual blocks during training.
#[derive(Debug, Clone)]
pub struct StochasticDepth {
    drop_prob: f32,
    mode: DropMode,
}

#[derive(Debug, Clone, Copy, PartialEq)]
pub enum DropMode {
    /// All samples in batch are dropped together
    Batch,
    /// Each sample dropped independently
    Row,
}

impl StochasticDepth {
    #[must_use]
    pub fn new(drop_prob: f32, mode: DropMode) -> Self {
        assert!((0.0..1.0).contains(&drop_prob), "drop_prob must be in [0, 1)");
        Self { drop_prob, mode }
    }

    /// Apply stochastic depth: returns `true` if the block should be kept (not dropped).
    #[must_use]
    pub fn should_keep(&self, training: bool) -> bool {
        if !training || self.drop_prob == 0.0 {
            return true;
        }
        rand::rng().random::<f32>() >= self.drop_prob
    }

    /// Compute the survival (keep) probability for the linear-decay schedule:
    /// 1 - (depth / `total_depth`) * `max_drop`.
    #[must_use]
    pub fn linear_decay(depth: usize, total_depth: usize, max_drop: f32) -> f32 {
        1.0 - (depth as f32 / total_depth as f32) * max_drop
    }

    #[must_use]
    pub fn drop_prob(&self) -> f32 {
        self.drop_prob
    }

    #[must_use]
    pub fn mode(&self) -> DropMode {
        self.mode
    }
}
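
// Illustrative sketch of the linear-decay schedule (depth counts are example
// values): survival probability falls linearly from 1.0 at the input towards
// 1.0 - max_drop at the deepest block, and blocks are never dropped at eval time.
#[cfg(test)]
mod stochastic_depth_usage_example {
    use super::*;

    #[test]
    fn linear_decay_and_eval_behaviour() {
        // With 10 blocks and max_drop = 0.5, block 5 survives with p = 0.75.
        let p = StochasticDepth::linear_decay(5, 10, 0.5);
        assert!((p - 0.75).abs() < 1e-6);
        // Outside training, blocks are always kept.
        let sd = StochasticDepth::new(0.8, DropMode::Batch);
        assert!(sd.should_keep(false));
    }
}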

/// R-Drop regularization (Liang et al., 2021).
///
/// Forces consistency between two forward passes with different dropout masks.
/// Adds bidirectional KL divergence loss between the two outputs.
///
/// Loss = CE(p1, y) + CE(p2, y) + α * (KL(p1||p2) + KL(p2||p1)) / 2
///
/// # Reference
/// Liang, X., et al. (2021). R-Drop: Regularized Dropout for Neural Networks.
#[derive(Debug, Clone)]
pub struct RDrop {
    alpha: f32,
}

impl RDrop {
    /// Create R-Drop with regularization weight alpha.
    #[must_use]
    pub fn new(alpha: f32) -> Self {
        assert!(alpha >= 0.0, "Alpha must be non-negative");
        Self { alpha }
    }

    #[must_use]
    pub fn alpha(&self) -> f32 {
        self.alpha
    }

    /// Compute KL divergence: KL(p || q) = Σ p * log(p / q)
    #[must_use]
    pub fn kl_divergence(&self, p: &[f32], q: &[f32]) -> f32 {
        assert_eq!(p.len(), q.len());
        let eps = 1e-10;
        p.iter()
            .zip(q.iter())
            .map(|(&pi, &qi)| {
                let pi = pi.max(eps);
                let qi = qi.max(eps);
                pi * (pi / qi).ln()
            })
            .sum()
    }

    /// Compute the bidirectional (symmetric) KL divergence: (KL(p‖q) + KL(q‖p)) / 2.
    #[must_use]
    pub fn symmetric_kl(&self, p: &[f32], q: &[f32]) -> f32 {
        f32::midpoint(self.kl_divergence(p, q), self.kl_divergence(q, p))
    }

    /// Compute R-Drop regularization loss between two forward passes.
    #[must_use]
    pub fn compute_loss(&self, logits1: &[f32], logits2: &[f32]) -> f32 {
        let p1 = softmax_slice(logits1);
        let p2 = softmax_slice(logits2);
        self.alpha * self.symmetric_kl(&p1, &p2)
    }
}
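
// Illustrative sketch of the R-Drop consistency loss (α and logits are example
// values): identical distributions give zero KL, so identical logits from the
// two forward passes incur no extra loss, while differing logits do.
#[cfg(test)]
mod rdrop_usage_example {
    use super::*;

    #[test]
    fn identical_passes_incur_no_penalty() {
        let rdrop = RDrop::new(0.5);
        let p = [0.7, 0.2, 0.1];
        assert!(rdrop.kl_divergence(&p, &p).abs() < 1e-6);

        let logits = [2.0, 0.5, -1.0];
        assert!(rdrop.compute_loss(&logits, &logits).abs() < 1e-6);
        assert!(rdrop.compute_loss(&logits, &[0.0, 0.0, 0.0]) > 0.0);
    }
}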

/// ONE PATH: Delegates to `nn::functional::softmax_1d` (UCBD §4).
fn softmax_slice(logits: &[f32]) -> Vec<f32> {
    crate::nn::functional::softmax_1d(logits)
}

/// `SpecAugment`: Data augmentation for speech recognition (Park et al., 2019).
///
/// Applies time warping, frequency masking, and time masking to spectrograms.
///
/// # Methods
///
/// - **Time Warping**: Warps spectrogram along time axis
/// - **Frequency Masking**: Masks F consecutive frequency channels
/// - **Time Masking**: Masks T consecutive time steps
///
/// # Reference
///
/// - Park, D., et al. (2019). `SpecAugment`: A Simple Data Augmentation Method
///   for Automatic Speech Recognition. Interspeech.
#[derive(Debug, Clone)]
pub struct SpecAugment {
    /// Number of frequency masks to apply
    num_freq_masks: usize,
    /// Maximum size of frequency mask
    freq_mask_param: usize,
    /// Number of time masks to apply
    num_time_masks: usize,
    /// Maximum size of time mask
    time_mask_param: usize,
    /// Mask value (usually 0 or mean)
    mask_value: f32,
}

impl Default for SpecAugment {
    fn default() -> Self {
        Self::new()
    }
}
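
// Minimal construction sketch: `SpecAugment::new` lives in the `specaugment`
// submodule declared below, and the `Default` impl above just delegates to it,
// so a default instance can be built without extra parameters.
#[cfg(test)]
mod specaugment_construction_example {
    use super::*;

    #[test]
    fn builds_with_default_parameters() {
        let _aug = SpecAugment::default();
    }
}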

mod specaugment;
pub use specaugment::*;