// tensorlogic_train/stochastic_depth.rs

//! Stochastic Depth (DropPath) for deep networks.
//!
//! Implements Drop-Path regularization, a technique that randomly drops entire
//! residual paths during training. This is particularly effective for very deep
//! networks and is widely used in Vision Transformers and ResNets.
//!
//! # References
//!
//! - Huang, G., Sun, Y., Liu, Z., Sedra, D., & Weinberger, K. Q. (2016).
//!   "Deep Networks with Stochastic Depth". ECCV 2016.
//!   <https://arxiv.org/abs/1603.09382>
//!
//! - Widely used in:
//!   - Vision Transformers (ViT, DeiT, Swin)
//!   - EfficientNet
//!   - ResNets and variants
//!
//! # Key Concepts
//!
//! **DropPath vs Dropout**:
//! - Dropout: Randomly zeros individual neurons
//! - DropPath: Randomly zeros entire paths/blocks
//!
//! **Usage**: Applied to residual connections:
//! ```text
//! output = x + DropPath(F(x))
//! ```
//!
//! **Linear Scheduling**: Drop probability increases with depth:
//! ```text
//! drop_prob(layer_i) = drop_prob_min + (drop_prob_max - drop_prob_min) * i / (L-1)
//! ```
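//!
//! # Example
//!
//! A minimal sketch of the residual pattern above, assuming the same
//! crate-root re-exports used in the item-level examples in this module:
//!
//! ```no_run
//! use tensorlogic_train::DropPath;
//! use scirs2_core::ndarray::Array2;
//! use scirs2_core::random::{StdRng, SeedableRng};
//!
//! let drop_path = DropPath::new(0.1).unwrap();
//! let mut rng = StdRng::seed_from_u64(0);
//!
//! let x = Array2::<f64>::ones((4, 8));   // block input
//! let f_x = Array2::<f64>::ones((4, 8)); // residual branch F(x)
//!
//! // output = x + DropPath(F(x))
//! let branch = drop_path.apply(&f_x.view(), true, &mut rng).unwrap();
//! let output = &x + &branch;
//! assert_eq!(output.dim(), (4, 8));
//! ```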

use crate::{TrainError, TrainResult};
use scirs2_core::ndarray::{Array2, ArrayView2};
use scirs2_core::random::{Rng, StdRng};

/// DropPath (Stochastic Depth) regularization.
///
/// Randomly drops entire paths in residual networks during training.
/// Kept paths are scaled by 1 / (1 - drop_prob) during training (inverted
/// dropout), so at test time paths pass through unchanged.
///
/// # Example
///
/// ```rust
/// use tensorlogic_train::DropPath;
/// use scirs2_core::ndarray::Array2;
/// use scirs2_core::random::{StdRng, SeedableRng};
///
/// let drop_path = DropPath::new(0.1).unwrap(); // 10% drop probability
/// let mut rng = StdRng::seed_from_u64(42);
///
/// let residual = Array2::ones((4, 8));
///
/// // Training mode: randomly drop (kept paths are scaled by 1 / keep_prob)
/// let output = drop_path.apply(&residual.view(), true, &mut rng).unwrap();
///
/// // Inference mode: the path is returned unchanged
/// let output_test = drop_path.apply(&residual.view(), false, &mut rng).unwrap();
/// ```
#[derive(Debug, Clone)]
pub struct DropPath {
    /// Probability of dropping the path (0.0 to 1.0).
    pub drop_prob: f64,
    /// Keep probability (1.0 - drop_prob).
    keep_prob: f64,
}

impl DropPath {
    /// Create a new DropPath regularizer.
    ///
    /// # Arguments
    ///
    /// * `drop_prob` - Probability of dropping the path (0.0 to 1.0)
    ///
    /// # Returns
    ///
    /// A new DropPath instance or error if drop_prob is invalid.
    pub fn new(drop_prob: f64) -> TrainResult<Self> {
        if !(0.0..=1.0).contains(&drop_prob) {
            return Err(TrainError::InvalidParameter(
                "drop_prob must be in [0, 1]".to_string(),
            ));
        }

        Ok(Self {
            drop_prob,
            keep_prob: 1.0 - drop_prob,
        })
    }

    /// Apply DropPath to a residual path.
    ///
    /// # Arguments
    ///
    /// * `path` - The residual path to potentially drop
    /// * `training` - Whether in training mode (randomly drop) or inference mode (pass through)
    /// * `rng` - Random number generator
    ///
    /// # Returns
    ///
    /// Transformed path (zeroed, scaled by 1 / keep_prob, or returned unchanged)
    pub fn apply(
        &self,
        path: &ArrayView2<f64>,
        training: bool,
        rng: &mut StdRng,
    ) -> TrainResult<Array2<f64>> {
        if !training || self.drop_prob == 0.0 {
            // Inference mode or zero drop probability: return the path unchanged
            return Ok(path.to_owned());
        }

        if self.drop_prob == 1.0 {
            // Always drop: return zeros
            return Ok(Array2::zeros(path.raw_dim()));
        }

        // Training mode: randomly drop the entire path
        let should_drop = rng.random::<f64>() < self.drop_prob;

        if should_drop {
            // Drop the path (return zeros)
            Ok(Array2::zeros(path.raw_dim()))
        } else {
            // Keep the path but scale by 1/keep_prob to maintain expected value
            // This is the "inverted dropout" technique
            Ok(path.mapv(|x| x / self.keep_prob))
        }
    }

    /// Apply DropPath to a batch of paths.
    ///
    /// Each sample in the batch is independently dropped with probability drop_prob.
    ///
    /// # Arguments
    ///
    /// * `paths` - Batch of residual paths (batch_size × features)
    /// * `training` - Whether in training mode
    /// * `rng` - Random number generator
    ///
    /// # Returns
    ///
    /// Batch with randomly dropped paths
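    ///
    /// # Example
    ///
    /// A minimal sketch of per-sample dropping, assuming the same re-exports
    /// used in the [`DropPath`] example above:
    ///
    /// ```no_run
    /// use tensorlogic_train::DropPath;
    /// use scirs2_core::ndarray::Array2;
    /// use scirs2_core::random::{StdRng, SeedableRng};
    ///
    /// let drop_path = DropPath::new(0.5).unwrap();
    /// let mut rng = StdRng::seed_from_u64(42);
    ///
    /// // One residual path per sample (batch_size = 4, features = 8).
    /// let paths = Array2::<f64>::ones((4, 8));
    /// let output = drop_path.apply_batch(&paths.view(), true, &mut rng).unwrap();
    ///
    /// // Each row is either all zeros (dropped) or scaled by 1 / keep_prob.
    /// assert_eq!(output.dim(), (4, 8));
    /// ```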
    pub fn apply_batch(
        &self,
        paths: &ArrayView2<f64>,
        training: bool,
        rng: &mut StdRng,
    ) -> TrainResult<Array2<f64>> {
        if !training || self.drop_prob == 0.0 {
            return Ok(paths.to_owned());
        }

        let (batch_size, _) = paths.dim();
        let mut output = paths.to_owned();

        // Independently drop each sample in batch
        for i in 0..batch_size {
            let should_drop = rng.random::<f64>() < self.drop_prob;
            if should_drop {
                // Zero out this sample
                for j in 0..output.ncols() {
                    output[[i, j]] = 0.0;
                }
            } else {
                // Scale by 1/keep_prob
                for j in 0..output.ncols() {
                    output[[i, j]] /= self.keep_prob;
                }
            }
        }

        Ok(output)
    }

    /// Get the keep probability.
    pub fn keep_probability(&self) -> f64 {
        self.keep_prob
    }

    /// Set a new drop probability.
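    ///
    /// # Example
    ///
    /// A minimal sketch of ramping the drop probability up during training
    /// (the warm-up schedule itself is only illustrative):
    ///
    /// ```no_run
    /// use tensorlogic_train::DropPath;
    ///
    /// let mut drop_path = DropPath::new(0.0).unwrap();
    /// for epoch in 0..10 {
    ///     // Warm up from 0.0 to 0.18 over 10 epochs.
    ///     drop_path.set_drop_prob(0.02 * epoch as f64).unwrap();
    /// }
    /// assert!((drop_path.keep_probability() - 0.82).abs() < 1e-10);
    /// ```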
    pub fn set_drop_prob(&mut self, drop_prob: f64) -> TrainResult<()> {
        if !(0.0..=1.0).contains(&drop_prob) {
            return Err(TrainError::InvalidParameter(
                "drop_prob must be in [0, 1]".to_string(),
            ));
        }

        self.drop_prob = drop_prob;
        self.keep_prob = 1.0 - drop_prob;
        Ok(())
    }
}

/// Linear stochastic depth scheduler.
///
/// Linearly increases drop probability from min to max across network depth.
/// This is the standard schedule used in most deep networks:
/// - Shallow layers: low drop probability (more stable)
/// - Deep layers: high drop probability (more regularization)
///
/// # Example
///
/// ```no_run
/// use tensorlogic_train::LinearStochasticDepth;
///
/// // 10 layers, drop_prob from 0.0 to 0.3
/// let scheduler = LinearStochasticDepth::new(10, 0.0, 0.3).unwrap();
///
/// // Get drop probability for layer 5 (5/9 of the way from min to max)
/// let drop_prob_5 = scheduler.get_drop_prob(5);
/// assert!((drop_prob_5 - 0.3 * 5.0 / 9.0).abs() < 1e-6);
/// ```
#[derive(Debug, Clone)]
pub struct LinearStochasticDepth {
    /// Total number of layers/blocks.
    pub num_layers: usize,
    /// Minimum drop probability (first layer).
    pub drop_prob_min: f64,
    /// Maximum drop probability (last layer).
    pub drop_prob_max: f64,
}

impl LinearStochasticDepth {
    /// Create a new linear stochastic depth scheduler.
    ///
    /// # Arguments
    ///
    /// * `num_layers` - Total number of layers in the network
    /// * `drop_prob_min` - Drop probability for first layer (usually 0.0)
    /// * `drop_prob_max` - Drop probability for last layer (e.g., 0.1-0.5)
    ///
    /// # Returns
    ///
    /// A new scheduler or error if parameters are invalid.
    pub fn new(num_layers: usize, drop_prob_min: f64, drop_prob_max: f64) -> TrainResult<Self> {
        if num_layers == 0 {
            return Err(TrainError::InvalidParameter(
                "num_layers must be > 0".to_string(),
            ));
        }

        if !(0.0..=1.0).contains(&drop_prob_min) || !(0.0..=1.0).contains(&drop_prob_max) {
            return Err(TrainError::InvalidParameter(
                "drop probabilities must be in [0, 1]".to_string(),
            ));
        }

        if drop_prob_min > drop_prob_max {
            return Err(TrainError::InvalidParameter(
                "drop_prob_min must be <= drop_prob_max".to_string(),
            ));
        }

        Ok(Self {
            num_layers,
            drop_prob_min,
            drop_prob_max,
        })
    }

    /// Get drop probability for a specific layer.
    ///
    /// Uses linear interpolation:
    /// ```text
    /// drop_prob(i) = drop_prob_min + (drop_prob_max - drop_prob_min) * i / (L-1)
    /// ```
    ///
    /// # Arguments
    ///
    /// * `layer_idx` - Layer index (0 to num_layers-1)
    ///
    /// # Returns
    ///
    /// Drop probability for this layer.
    pub fn get_drop_prob(&self, layer_idx: usize) -> f64 {
        if layer_idx >= self.num_layers {
            return self.drop_prob_max;
        }

        if self.num_layers == 1 {
            return self.drop_prob_min;
        }

        // Linear interpolation
        let ratio = layer_idx as f64 / (self.num_layers - 1) as f64;
        self.drop_prob_min + (self.drop_prob_max - self.drop_prob_min) * ratio
    }

    /// Create DropPath instances for all layers.
    ///
    /// # Returns
    ///
    /// Vector of DropPath instances with linearly increasing drop probabilities.
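    ///
    /// # Example
    ///
    /// A minimal sketch: one [`DropPath`] per residual block.
    ///
    /// ```no_run
    /// use tensorlogic_train::LinearStochasticDepth;
    ///
    /// let scheduler = LinearStochasticDepth::new(4, 0.0, 0.3).unwrap();
    /// let drop_paths = scheduler.create_drop_paths().unwrap();
    ///
    /// assert_eq!(drop_paths.len(), 4);
    /// assert!((drop_paths[0].drop_prob - 0.0).abs() < 1e-10);
    /// assert!((drop_paths[3].drop_prob - 0.3).abs() < 1e-10);
    /// ```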
    pub fn create_drop_paths(&self) -> TrainResult<Vec<DropPath>> {
        let mut drop_paths = Vec::with_capacity(self.num_layers);

        for i in 0..self.num_layers {
            let drop_prob = self.get_drop_prob(i);
            drop_paths.push(DropPath::new(drop_prob)?);
        }

        Ok(drop_paths)
    }
}

/// Exponential stochastic depth scheduler.
///
/// Increases drop probability faster than linearly across network depth
/// (implemented here as a quadratic ramp), giving more aggressive
/// regularization in the deeper layers.
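///
/// # Example
///
/// A minimal sketch; assumes `ExponentialStochasticDepth` is re-exported at
/// the crate root like the other schedulers in this module:
///
/// ```no_run
/// use tensorlogic_train::ExponentialStochasticDepth;
///
/// let scheduler = ExponentialStochasticDepth::new(10, 0.0, 0.8).unwrap();
///
/// // The quadratic ramp stays below a linear schedule in the early layers.
/// assert!(scheduler.get_drop_prob(5) < 0.8 * 5.0 / 9.0);
/// assert!((scheduler.get_drop_prob(9) - 0.8).abs() < 1e-10);
/// ```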
#[derive(Debug, Clone)]
pub struct ExponentialStochasticDepth {
    /// Total number of layers/blocks.
    pub num_layers: usize,
    /// Drop probability for first layer.
    pub drop_prob_min: f64,
    /// Drop probability for last layer.
    pub drop_prob_max: f64,
}

impl ExponentialStochasticDepth {
    /// Create a new exponential stochastic depth scheduler.
    pub fn new(num_layers: usize, drop_prob_min: f64, drop_prob_max: f64) -> TrainResult<Self> {
        if num_layers == 0 {
            return Err(TrainError::InvalidParameter(
                "num_layers must be > 0".to_string(),
            ));
        }

        if !(0.0..=1.0).contains(&drop_prob_min) || !(0.0..=1.0).contains(&drop_prob_max) {
            return Err(TrainError::InvalidParameter(
                "drop probabilities must be in [0, 1]".to_string(),
            ));
        }

        if drop_prob_min > drop_prob_max {
            return Err(TrainError::InvalidParameter(
                "drop_prob_min must be <= drop_prob_max".to_string(),
            ));
        }

        Ok(Self {
            num_layers,
            drop_prob_min,
            drop_prob_max,
        })
    }

    /// Get drop probability for a specific layer using a quadratic
    /// (faster-than-linear) ramp.
    pub fn get_drop_prob(&self, layer_idx: usize) -> f64 {
        if layer_idx >= self.num_layers {
            return self.drop_prob_max;
        }

        if self.num_layers == 1 {
            return self.drop_prob_min;
        }

        // Quadratic ramp: squaring the linear ratio keeps early layers close
        // to drop_prob_min and accelerates toward drop_prob_max at the end.
        let ratio = layer_idx as f64 / (self.num_layers - 1) as f64;
        let exp_ratio = ratio * ratio;

        self.drop_prob_min + (self.drop_prob_max - self.drop_prob_min) * exp_ratio
    }

    /// Create DropPath instances for all layers.
    pub fn create_drop_paths(&self) -> TrainResult<Vec<DropPath>> {
        let mut drop_paths = Vec::with_capacity(self.num_layers);

        for i in 0..self.num_layers {
            let drop_prob = self.get_drop_prob(i);
            drop_paths.push(DropPath::new(drop_prob)?);
        }

        Ok(drop_paths)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;
    use scirs2_core::random::SeedableRng;

    fn create_test_rng() -> StdRng {
        StdRng::seed_from_u64(42)
    }

    #[test]
    fn test_drop_path_creation() {
        let dp = DropPath::new(0.2).unwrap();
        assert_eq!(dp.drop_prob, 0.2);
        assert!((dp.keep_prob - 0.8).abs() < 1e-10);
    }

    #[test]
    fn test_drop_path_invalid_prob() {
        assert!(DropPath::new(-0.1).is_err());
        assert!(DropPath::new(1.5).is_err());
    }

    #[test]
    fn test_drop_path_zero_prob() {
        let dp = DropPath::new(0.0).unwrap();
        let mut rng = create_test_rng();

        let path = array![[1.0, 2.0], [3.0, 4.0]];

        // With 0% drop prob, path should be unchanged
        let output = dp.apply(&path.view(), true, &mut rng).unwrap();
        assert_eq!(output, path);
    }

    #[test]
    fn test_drop_path_full_prob() {
        let dp = DropPath::new(1.0).unwrap();
        let mut rng = create_test_rng();

        let path = array![[1.0, 2.0], [3.0, 4.0]];

        // With 100% drop prob, should return zeros
        let output = dp.apply(&path.view(), true, &mut rng).unwrap();
        assert_eq!(output, Array2::<f64>::zeros((2, 2)));
    }

    #[test]
    fn test_drop_path_inference_mode() {
        let dp = DropPath::new(0.5).unwrap();
        let mut rng = create_test_rng();

        let path = array![[1.0, 2.0], [3.0, 4.0]];

        // In inference mode (training=false), path should be unchanged
        let output = dp.apply(&path.view(), false, &mut rng).unwrap();
        assert_eq!(output, path);
    }

    #[test]
    fn test_drop_path_training_mode() {
        let dp = DropPath::new(0.5).unwrap();
        let mut rng = create_test_rng();

        let path = array![[1.0, 2.0]];

        // Run multiple times to check stochastic behavior
        let mut dropped_count = 0;
        let mut kept_count = 0;

        for _ in 0..100 {
            let output = dp.apply(&path.view(), true, &mut rng).unwrap();

            if output[[0, 0]] == 0.0 {
                dropped_count += 1;
            } else {
                kept_count += 1;
                // When kept, should be scaled by 1/keep_prob = 2.0
                assert!((output[[0, 0]] - 2.0).abs() < 1e-10);
            }
        }

        // With 50% drop prob, should drop roughly half the time
        assert!(dropped_count > 30 && dropped_count < 70);
        assert!(kept_count > 30 && kept_count < 70);
    }

    #[test]
    fn test_drop_path_batch() {
        let dp = DropPath::new(0.5).unwrap();
        let mut rng = create_test_rng();

        let paths = array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]];

        let output = dp.apply_batch(&paths.view(), true, &mut rng).unwrap();

        // Shape should be preserved
        assert_eq!(output.shape(), paths.shape());

        // Some rows should be dropped (all zeros), others kept and scaled
        let mut dropped_rows = 0;
        for i in 0..output.nrows() {
            if output[[i, 0]] == 0.0 && output[[i, 1]] == 0.0 {
                dropped_rows += 1;
            }
        }

        // With 50% drop prob and 4 rows, expect ~2 dropped
        assert!(dropped_rows > 0);
    }

    #[test]
    fn test_drop_path_set_prob() {
        let mut dp = DropPath::new(0.2).unwrap();
        assert_eq!(dp.drop_prob, 0.2);

        dp.set_drop_prob(0.5).unwrap();
        assert_eq!(dp.drop_prob, 0.5);
        assert!((dp.keep_prob - 0.5).abs() < 1e-10);

        // Invalid probability
        assert!(dp.set_drop_prob(1.5).is_err());
    }

    #[test]
    fn test_linear_stochastic_depth_creation() {
        let scheduler = LinearStochasticDepth::new(10, 0.0, 0.5).unwrap();
        assert_eq!(scheduler.num_layers, 10);
        assert_eq!(scheduler.drop_prob_min, 0.0);
        assert_eq!(scheduler.drop_prob_max, 0.5);
    }

    #[test]
    fn test_linear_stochastic_depth_invalid() {
        assert!(LinearStochasticDepth::new(0, 0.0, 0.5).is_err());
        assert!(LinearStochasticDepth::new(10, -0.1, 0.5).is_err());
        assert!(LinearStochasticDepth::new(10, 0.0, 1.5).is_err());
        assert!(LinearStochasticDepth::new(10, 0.6, 0.3).is_err());
    }

    #[test]
    fn test_linear_stochastic_depth_interpolation() {
        let scheduler = LinearStochasticDepth::new(10, 0.0, 0.9).unwrap();

        // First layer: min drop prob
        assert!((scheduler.get_drop_prob(0) - 0.0).abs() < 1e-10);

        // Layer 5: 5/9 of the way from 0.0 to 0.9, i.e. 0.5
        assert!((scheduler.get_drop_prob(5) - 0.5).abs() < 1e-6);

        // Last layer: max drop prob
        assert!((scheduler.get_drop_prob(9) - 0.9).abs() < 1e-10);
    }

    #[test]
    fn test_linear_stochastic_depth_create_paths() {
        let scheduler = LinearStochasticDepth::new(5, 0.0, 0.4).unwrap();
        let paths = scheduler.create_drop_paths().unwrap();

        assert_eq!(paths.len(), 5);

        // Check drop probabilities increase linearly
        assert!((paths[0].drop_prob - 0.0).abs() < 1e-10);
        assert!((paths[2].drop_prob - 0.2).abs() < 1e-10);
        assert!((paths[4].drop_prob - 0.4).abs() < 1e-10);
    }

    #[test]
    fn test_exponential_stochastic_depth() {
        let scheduler = ExponentialStochasticDepth::new(10, 0.0, 0.8).unwrap();

        // First layer: min drop prob
        assert!((scheduler.get_drop_prob(0) - 0.0).abs() < 1e-10);

        // Last layer: max drop prob
        assert!((scheduler.get_drop_prob(9) - 0.8).abs() < 1e-10);

        // Middle layers ramp up faster than linearly
        let mid_prob = scheduler.get_drop_prob(5);
        let linear_mid = 0.8 * 5.0 / 9.0; // what a linear schedule would give at layer 5

        // The quadratic ramp stays below the linear schedule before the last layer
        assert!(mid_prob < linear_mid);
    }

    #[test]
    fn test_exponential_create_paths() {
        let scheduler = ExponentialStochasticDepth::new(5, 0.0, 0.4).unwrap();
        let paths = scheduler.create_drop_paths().unwrap();

        assert_eq!(paths.len(), 5);

        // Verify increasing drop probabilities
        for i in 0..paths.len() - 1 {
            assert!(paths[i].drop_prob <= paths[i + 1].drop_prob);
        }
    }
}