//! A Rust implementation of the Lei-Jordan papers:
//! - ``On the adaptivity of stochastic gradient-based optimization`` (2019, 2020) [SCSG-1](https://arxiv.org/abs/1904.04480)
//! - ``Less than a single pass: stochastically controlled stochastic gradient`` (2019) [SCSG-2](https://arxiv.org/abs/1609.03261)

use log::Level::*;
use log::{debug, error, info, log_enabled, trace, warn};

use cpu_time::ProcessTime;
use std::time::SystemTime;

use std::iter::FromIterator;

use rand::distributions::Distribution;
use rand::SeedableRng;
// a fast but non cryptographically secure RNG; its jump() method can be used for parallel streams
use ndarray::{Array, Dimension};
use rand_xoshiro::Xoshiro256PlusPlus;

use crate::monitor::*;
use crate::types::*;

// a struct holding the batch sizes (large and mini) to use at a given iteration
// we must have large_batch >= nb_mini_batch_parameter >= mini_batch  (cf. Th 4.1)
pub struct BatchSizeInfo {
    _step: usize,
    // large batch size
    large_batch: usize,
    // mini batch size
    mini_batch: usize,
    // nb mini batch parameter
    nb_mini_batch_parameter: f64,
    // step size
    step_size: f64,
}

///
/// Provides Stochastic Controlled Gradient Descent optimization based on two papers by Lei and Jordan:
/// - ``"On the adaptivity of stochastic gradient-based optimization"`` arxiv 2019, 2020 [SCSG-1](https://arxiv.org/abs/1904.04480)
/// - ``"Less than a single pass: stochastically controlled stochastic gradient"`` arxiv 2019 [SCSG-2](https://arxiv.org/abs/1609.03261)
///
/// According to the first paper we have the following notations:
///  
/// One iteration j consists of:
///    - a large batch of size Bⱼ
///    - a number, noted mⱼ, of small batches of size bⱼ
///    - a position update with a step ηⱼ.
///
/// The number of mini batches is described by a random variable with a geometric law.
///
/// The paper establishes rates of convergence depending on the ratios
/// mⱼ/Bⱼ, bⱼ/mⱼ and ηⱼ/bⱼ and their products.
///  
/// The second paper,
/// "Less than a single pass: stochastically controlled stochastic gradient",
/// describes a simplified version where the mini batches consist of just one term
/// and the number of mini batches is set to the mean of the corresponding geometric variable.
///   
/// We adopt a mix of the two papers:
///  - Letting the size of the mini batches grow a little seems more stable than keeping it at 1
/// (in particular when the initialization of the algorithm varies),
/// and replacing the geometric law by its mean is clearly more stable given the large variance of that law.  
///
///  - We choose a fraction of the number of terms in the sum (*large_batch_fraction_init*) and alfa
/// so that large_batch_fraction_init * alfa^(2*nbiter) = 1.  
///
/// Then, if nbterms is the number of terms in the function to minimize and j is the iteration number:
///
///  -    Bⱼ evolves as :   large_batch_fraction_init * nbterms * alfa^(2j)
///  -    mⱼ evolves as :   m_zero * nbterms * alfa^(3j/2)
///  -    bⱼ evolves as :   b_0 * alfa^j
///  -    ηⱼ evolves as :   eta_0 / alfa^(j/2)
///     
///
///  The evolution of Bⱼ is bounded above by nbterms/10 (this can be modified with [Self::set_large_batch_max_fraction()]) and that of bⱼ by nbterms/100.  
///  The size of the small batches must stay small, so b₀ must be small (a value of 1 is typically fine).
///  
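///  As an illustration (rounded values, assuming nbterms = 10_000, 50 iterations and
///  large_batch_fraction_init = 0.02): alfa = exp(-ln(0.02)/(2*50)) ≈ 1.04, so B₀ = 200 and
///  Bⱼ roughly doubles every 9 iterations until it reaches the nbterms/10 cap.
///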
///
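/// # Example
///
/// A minimal usage sketch; `MySum` stands for a user-defined type implementing the crate's
/// `SummationC1` trait (it is illustrative only, see the examples shipped with the crate for
/// complete, runnable programs):
///
/// ```ignore
/// use ndarray::Array1;
///
/// // hypothetical objective: a sum of terms implementing SummationC1
/// let function = MySum::new();
/// // eta_zero, m_zero, mini_batch_size_init, large_batch_fraction_init
/// let mut scsg = StochasticControlledGradientDescent::new(0.1, 0.004, 1, 0.02);
/// scsg.seed([42u8; 32]);
/// // starting point of the optimization
/// let initial_position = Array1::<f64>::zeros(10);
/// // run at most 50 iterations; the returned Solution holds the final position and value
/// let solution = scsg.minimize(&function, &initial_position, Some(50));
/// ```
///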
pub struct StochasticControlledGradientDescent {
    rng: Xoshiro256PlusPlus,
    /// initial step size η₀
    eta_zero: f64,
    /// multiplicative factor in mⱼ, governing the evolution of the number of small (mini batch) iterations
    m_zero: f64,
    /// b_0 in the paper; governs the size of the mini batches
    mini_batch_size_init: usize,
    /// related to B_0 in the paper: fraction of the terms used to initialize B_0
    large_batch_fraction_init: f64,
    /// maximum ratio: size of large batch / nb_terms in the sum (defaults to 0.1)
    large_batch_max_fraction: f64,
}

impl StochasticControlledGradientDescent {
    /// The arguments are:
    ///   - eta_zero : initial step size along the gradient; a value of 0.1 is a good default choice.
    ///   - m_zero : a good value is 0.2 * large_batch_fraction_init so that mⱼ << Bⱼ.
    ///   - mini_batch_size_init : base value for the size of the mini batches; a value of 1 is a good default choice.
    ///   - large_batch_fraction_init : fraction of nbterms used to initialize the large batch size; a good default value is between 0.01
    ///     and 0.02 so that the large batch size begins at 0.01 * nbterms or 0.02 * nbterms.
    ///
    /// (see examples)
    ///
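    /// For instance (illustrative values only), taking large_batch_fraction_init = 0.02 and
    /// m_zero = 0.2 * 0.02 = 0.004:
    ///
    /// ```ignore
    /// let scsg = StochasticControlledGradientDescent::new(0.1, 0.004, 1, 0.02);
    /// ```
    ///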
    pub fn new(
        eta_zero: f64,
        m_zero: f64,
        mini_batch_size_init: usize,
        large_batch_fraction_init: f64,
    ) -> StochasticControlledGradientDescent {
        //
        if large_batch_fraction_init > 1. {
            warn!("large_batch_fraction_init > 1. ; the fraction used to initialize the large batch size must be <= 1. , exiting");
            std::process::exit(1);
        }
        if m_zero > large_batch_fraction_init {
            warn!("m_zero > large_batch_fraction_init ; the base fraction for the number of mini batches should be less than the large batch fraction, exiting");
            std::process::exit(1);
        }
        info!(" eta_zero {:2.4E} m_zero {:2.4} \n mini batch size {:?} , large_batch_fraction_init {:2.4E}", 
            eta_zero, m_zero, mini_batch_size_init, large_batch_fraction_init);
        //
        StochasticControlledGradientDescent {
            rng: Xoshiro256PlusPlus::seed_from_u64(4664397),
            eta_zero,
            m_zero,
            mini_batch_size_init,
            large_batch_fraction_init,
            large_batch_max_fraction: 0.1,
        }
    }
    /// Seeds the random number generator using the supplied `seed`.
    /// This is useful to create reproducible results.
    pub fn seed(&mut self, seed: [u8; 32]) {
        self.rng = Xoshiro256PlusPlus::from_seed(seed);
    }

    /// If a larger batch size is needed, the maximum large batch size will be set to
    /// nb_terms * fraction (the default for fraction is 0.1).
    pub fn set_large_batch_max_fraction(&mut self, fraction: f64) {
        assert!(0.01 < fraction && fraction < 1.);
        self.large_batch_max_fraction = fraction;
    }

    // the batch growing factor cannot be too large, so it must be adjusted according to the values of nbterms and nb_max_iterations.
    // we choose the batch growing factor alfa so that:
    //     large_batch_fraction_init * alfa^(2*nb_max_iterations) = 1.
    // (large_batch_fraction_init is the fraction of nbterms we take at the beginning of the iterations)
    fn estimate_batch_growing_factor(&self, nb_max_iterations: usize, nbterms: usize) -> f64 {
        //
        if self.m_zero * (nbterms as f64) < 1. {
            error!("the m_zero fraction should be greater than 1/(number of terms in the sum)");
            std::process::exit(1);
        }
        //
        let log_alfa = (-self.large_batch_fraction_init.ln()) / (2. * nb_max_iterations as f64);
        let batch_growing_factor = log_alfa.exp();
        if batch_growing_factor <= 1. {
            println!("the batch growing factor should be greater than 1. ; possibly you can reduce the number of iterations");
        }
        debug!(" batch_growing_factor : {:2.4E}", batch_growing_factor);
        //
        return batch_growing_factor;
    } // end of estimate_batch_growing_factor

    // returns BatchSizeInfo for current iteration
    fn get_batch_size_at_jstep(
        &self,
        batch_growing_factor: f64,
        nbterms: usize,
        j: usize,
    ) -> BatchSizeInfo {
        let alfa_j = batch_growing_factor.powi(j as i32);
        // max size of the large batch: all the terms when nbterms <= 100, else self.large_batch_max_fraction * nbterms
        let max_large_batch_size = if nbterms > 100 {
            (nbterms as f64 * self.large_batch_max_fraction).ceil() as usize
        } else {
            nbterms
        };
        // ensure max_mini_batch_size is at least 1.
        let max_mini_batch_size = (nbterms as f64 / 100.).ceil() as usize;
        // B_j
        let large_batch_size =
            ((self.large_batch_fraction_init * (nbterms as f64) * alfa_j * alfa_j).ceil() as usize)
                .min(max_large_batch_size);
        // b_j grows slowly (proportionally to alfa_j), capped by max_mini_batch_size
        let mini_batch_size =
            ((self.mini_batch_size_init as f64 * alfa_j).floor() as usize).min(max_mini_batch_size);
        // m_j computed so that the mean number of mini batches stays below large_batch_size, since mini_batch_size_init < large_batch_size_init is enforced
        let nb_mini_batch_parameter = self.m_zero * (nbterms as f64) * alfa_j.powf(1.5);
        // let step decrease really slowly
        let step_size = self.eta_zero / alfa_j.sqrt();
        //
        BatchSizeInfo {
            _step: j,
            large_batch: large_batch_size,
            mini_batch: mini_batch_size,
            nb_mini_batch_parameter,
            step_size,
        }
    } // end of get_batch_size_at_jstep
    ///
    /// number of mini batch iterations. The papers draw it from a geometric law of success probability p = b_j/(m_j+b_j),
    /// i.e. P(N=k) = p * (1-p)^k (number of failures before the first success), whose mean is m_j/b_j;
    /// here we use that mean directly.
    fn get_nb_small_mini_batches(&self, batch_size_info: &BatchSizeInfo) -> usize {
        let m_j = batch_size_info.nb_mini_batch_parameter as f64;
        let b_j = batch_size_info.mini_batch as f64;
        // we return the mean of the geometric law; sampling it is too unstable due to its large variance.
        let mut n_j = (m_j / b_j).ceil() as usize;
        n_j = n_j.min(batch_size_info.large_batch);
        debug!(
            " nb small mini batch {:?} m_j {:2.4E} b_j : {:2.4E} ",
            n_j, m_j, b_j
        );
        return n_j;
    }
} // end impl StochasticControlledGradientDescent

// Samples `size_asked` distinct indexes uniformly from `in_terms` in one pass (reservoir sampling,
// Li's algorithm L); used to sample the batches, and really faster than shuffling. References:
// 1. J.S. Vitter, "Faster Methods for Random Sampling", Comm. ACM, 1984
// 2. Kim-Hung Li, "Reservoir-Sampling Algorithms of Time Complexity O(n(1+log(N/n)))", ACM TOMS, Vol 20, No 4, December 1994
// 3. https://en.wikipedia.org/wiki/Reservoir_sampling

fn sample_without_replacement_reservoir(
    size_asked: usize,
    in_terms: &[usize],
    rng: &mut Xoshiro256PlusPlus,
) -> Vec<usize> {
    // the caller must ask for at least one term and at most the whole slice
    assert!(size_asked >= 1 && size_asked <= in_terms.len());
    // initialize the reservoir with the first size_asked terms
    let mut out_terms = Vec::<usize>::with_capacity(size_asked);
    out_terms.extend_from_slice(&in_terms[..size_asked]);
    // w drives the random skip lengths between replacements (Li's algorithm L)
    let mut xsi: f64 = rand_distr::Standard.sample(rng);
    let mut w: f64 = (xsi.ln() / (size_asked as f64)).exp();

    // s indexes the input terms; it jumps forward by a random number of skipped items
    let mut s = size_asked - 1;
    while s < in_terms.len() {
        xsi = rand_distr::Standard.sample(rng);
        s = s + (xsi.ln() / (1. - w).ln()).floor() as usize + 1;
        if s < in_terms.len() {
            // update random index in out_terms
            xsi = rand_distr::Standard.sample(rng);
            let idx = (size_asked as f64 * xsi).floor() as usize;
            out_terms[idx] = in_terms[s];
            // update w
            xsi = rand_distr::Standard.sample(rng);
            w = w * (xsi.ln() / (size_asked as f64)).exp();
        }
    }
    out_terms
}

//=====================================================================================

impl Default for StochasticControlledGradientDescent {
    fn default() -> Self {
        StochasticControlledGradientDescent {
            rng: Xoshiro256PlusPlus::seed_from_u64(4664397),
            eta_zero: 0.1,
            m_zero: 0.04,
            mini_batch_size_init: 1,
            large_batch_fraction_init: 0.1,
            large_batch_max_fraction: 0.1,
        }
    }
}

//========================================================================================

impl<D: Dimension, F: SummationC1<D>> Minimizer<D, F, usize>
    for StochasticControlledGradientDescent
{
    type Solution = Solution<D>;

    fn minimize(
        &self,
        function: &F,
        initial_position: &Array<f64, D>,
        max_iterations: Option<usize>,
    ) -> Solution<D> {
        //
        let cpu_start = ProcessTime::now();
        let sys_now = SystemTime::now();
        //
        let mut position = initial_position.clone();
        let mut value = function.value(&position);
        let nb_max_iterations = max_iterations.expect("minimize requires max_iterations to be set");
        // direction propagation
        let mut direction: Array<f64, D> = position.clone();
        direction.fill(0.);

        if log_enabled!(Info) {
            info!("Starting with y = {:e} \n for x = {:?}", value, position);
        } else {
            info!("Starting with y = {:e}", value);
        }
        trace!("nb_max_iterations {:?}", nb_max_iterations);
        //
        let mut iteration: usize = 0;
        let mut rng = self.rng.clone();
        let nb_terms = function.terms();
        let mut monitoring = IterationRes::<D>::new(nb_max_iterations, SolMode::Last);
        let batch_growing_factor =
            self.estimate_batch_growing_factor(nb_max_iterations, function.terms());
        // temporary gradients passed by ref to avoid possibly large reallocation
        let mut large_batch_gradient: Array<f64, D> = position.clone();
        large_batch_gradient.fill(0.);
        //
        let mut mini_batch_gradient_current: Array<f64, D> = position.clone();
        mini_batch_gradient_current.fill(0.);
        //
        let mut mini_batch_gradient_origin: Array<f64, D> = position.clone();
        mini_batch_gradient_origin.fill(0.);
        //
        let all_indexes = Vec::<usize>::from_iter::<std::ops::Range<usize>>(0..nb_terms);
        // now we work
        loop {
            // get iteration parameters
            let iter_params =
                self.get_batch_size_at_jstep(batch_growing_factor, nb_terms, iteration);
            let n_j = self.get_nb_small_mini_batches(&iter_params);
            let step_size = iter_params.step_size;
            // sample large batch of size Bj
            let large_batch_indexes = sample_without_replacement_reservoir(
                iter_params.large_batch,
                &all_indexes,
                &mut rng,
            );
            trace!("\n iter {:?} got large batch size {:?}, nb mini batch {:?}, mini batch size {:?}, step {:2.4E}", 
                    iteration, large_batch_indexes.len(), n_j, iter_params.mini_batch, iter_params.step_size);
            // compute gradient on large batch index set and store initial position
            function.mean_partial_gradient(
                &position,
                &large_batch_indexes,
                &mut large_batch_gradient,
            );
            let position_before_mini_batch = position.clone();
            // loop on small batch iterations
            for _k in 0..n_j {
                // sample mini batch terms
                let terms = sample_without_replacement_reservoir(
                    iter_params.mini_batch,
                    &all_indexes,
                    &mut rng,
                );
                //
                function.mean_partial_gradient(&position, &terms, &mut mini_batch_gradient_current);
                //
                function.mean_partial_gradient(
                    &position_before_mini_batch,
                    &terms,
                    &mut mini_batch_gradient_origin,
                );
                //
                // if log_enabled!(Trace)  {
                //     if _k == 0 {
                //         assert!(norm_l2(&mini_batch_gradient_origin) > 0.);
                //         trace!("mini_batch_gradient_origin  L2 {:2.4E} ", norm_l2(&mini_batch_gradient_origin));
                //     }
                //     else {
                //         trace!("mini_batch_gradient_current L2 {:2.4E} ", norm_l2(&mini_batch_gradient_current));
                //         assert!(norm_l2(&mini_batch_gradient_current) > 0.);
                //     }
                // }
                //
                direction = &mini_batch_gradient_current - &mini_batch_gradient_origin
                    + &large_batch_gradient;
                // step into the direction of the negative gradient
                position = position - step_size * &direction;
            } // end of mini batch loop
            iteration += 1;
            // some monitoring
            value = function.value(&position);
            let gradnorm = norm_l2(&direction);
            monitoring.push(value, &position, gradnorm);
            if log_enabled!(Debug) {
                debug!(
                    "\n Iteration {:?} y = {:2.4E} , gradient norm : {:2.4}",
                    iteration, value, gradnorm
                );
            }
            // convergence control or max iterations control
            if iteration >= nb_max_iterations {
                break;
            }
        } // end global loop
          //
        log::info!("\n StochasticControlledGradientDescent::minimize ; sys time(ms) {:?} cpu time(ms) {:?}", 
        sys_now.elapsed().unwrap().as_millis(),
        cpu_start.elapsed().as_millis());
        info!(
            "Reached the requested maximum number of iterations ({:?}), stopping optimization",
            nb_max_iterations
        );
        let rank = monitoring.check_monoticity();
        info!(" monotonous convergence from rank : {:?}", rank);
        return Solution::new(position, value);
    } // end minimize
} // end of impl Minimizer for StochasticControlledGradientDescent

//==================================================================================

#[cfg(test)]
mod tests {

    //    use std::iter::FromIterator;
    use super::*;
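
    // Additional sanity check, added as an illustrative sketch: with the default parameters the
    // batch growing factor must be > 1 and the large batch size must not decrease from one
    // iteration to the next (it is capped by large_batch_max_fraction * nbterms).
    #[test]
    fn test_batch_size_growth() {
        let scsg = StochasticControlledGradientDescent::default();
        let nbterms = 10_000;
        let nb_iter = 50;
        let alfa = scsg.estimate_batch_growing_factor(nb_iter, nbterms);
        assert!(alfa > 1.);
        let mut previous = scsg.get_batch_size_at_jstep(alfa, nbterms, 0).large_batch;
        for j in 1..nb_iter {
            let current = scsg.get_batch_size_at_jstep(alfa, nbterms, j).large_batch;
            assert!(current >= previous);
            previous = current;
        }
    }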

    #[test]
    fn test_reservoir_sampling() {
        let mut rng = Xoshiro256PlusPlus::seed_from_u64(4664397);
        let nb_asked = 100;
        let in_terms = Vec::<usize>::from_iter::<std::ops::Range<usize>>(0..60000);
        let selected_terms = sample_without_replacement_reservoir(nb_asked, &in_terms, &mut rng);
        //
        assert_eq!(selected_terms.len(), nb_asked);
        // sort and print
    }
} // end of mod tests