// multistochgrad/types.rs

//! This file is inspired by the crate `optimisation` written by Oliver Mader <b52@reaktor42.de>.
//! I kept the traits Function, FunctionC1, Summation and SummationC1, which provide
//! the interface for users to define a minimisation problem.
//!
//! In fact, when minimising a summation function we often seek to minimise the mean of the terms,
//! which is equivalent up to a scaling of the gradient. This makes the implementation of batched
//! stochastic gradient more natural, as we always compute the mean gradient over the terms taken
//! into account.
//!
//! I kept the method names and reimplemented them with the following changes
//! (a usage sketch follows the list):
//!
//! 1. In batched stochastic gradient we need to define the mean gradient on a subset of indexes.
//! 2. I use the crate ndarray, which provides vector addition and works well with rayon for parallel computation.
//! 3. I got rid of the iterator on indexes, as indexes are always usize.
//! 4. The function minimize in trait Minimizer takes a generic argument for future extensions.
//! 5. I use rayon parallel iterators to compute summation values and gradients.
//!
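//! A minimal sketch of how a user could define a problem with these traits: a toy
//! least-squares objective over a set of observations. The type `LeastSquares` and the
//! field `obs` are illustrative names only, not part of this crate.
//!
//! ```ignore
//! use ndarray::{Array1, Ix1};
//!
//! struct LeastSquares { obs: Vec<Array1<f64>> }
//!
//! impl Summation<Ix1> for LeastSquares {
//!     fn terms(&self) -> usize { self.obs.len() }
//!     // squared distance to one observation
//!     fn term_value(&self, position: &Array1<f64>, term: usize) -> f64 {
//!         let d = position - &self.obs[term];
//!         d.dot(&d)
//!     }
//! }
//!
//! impl SummationC1<Ix1> for LeastSquares {
//!     // gradient of one term, overwriting `gradient`
//!     fn term_gradient(&self, position: &Array1<f64>, term: &usize, gradient: &mut Array1<f64>) {
//!         *gradient = (position - &self.obs[*term]) * 2.;
//!     }
//! }
//! // value() and gradient() (means over all terms) are then provided by the
//! // blanket implementations below.
//! ```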
use rayon::prelude::*;

use ndarray::{Array, Dimension};

/// Defines an objective function `f` that is subject to minimization.
/// The trait must be `Sync` so that it can be shared across threads.
pub trait Function<D:Dimension> : Sync {
    /// Computes the objective function at a given `position` `x`, i.e., `f(x) = y`.
    fn value(&self, position: &Array<f64,D>) -> f64;
}


/// Defines an objective function `f` that is able to compute the first derivative
/// `f'(x)`.
pub trait FunctionC1<D:Dimension> : Function<D> {
    /// Computes the gradient of the objective function at a given `position` `x`,
    /// i.e., `∀ᵢ ∂/∂xᵢ f(x) = ∇f(x)`.
    fn gradient(&self, position: &Array<f64,D>) -> Array<f64,D>;
}

/// Defines a summation of individual functions, i.e., f(x) = ∑ᵢ fᵢ(x).
pub trait Summation<D:Dimension>: Function<D> {
    /// Returns the number of individual functions that are terms of the summation.
    fn terms(&self) -> usize;

    /// Computes the value of one individual function identified by its index `term`,
    /// given the `position` `x`.
    fn term_value(&self, position: &Array<f64,D>, term: usize) -> f64;

    /// Computes the partial sum over a set of individual functions identified by `terms`,
    /// without any normalization (no division by the number of terms).
    fn partial_value(&self, position: &Array<f64,D>, terms: &[usize]) -> f64 {
        terms.into_par_iter().map(|t| self.term_value(position, *t)).sum()
    }
} // end trait Summation

/// The blanket implementation always returns the mean of the terms.
impl<D : Dimension, S: Summation<D> > Function<D> for S {
    fn value(&self, position: &Array<f64,D>) -> f64 {
        let value = self.partial_value(position, &(0..self.terms()).collect::<Vec<usize>>());
        value / (self.terms() as f64)
    }
}

/// Defines a summation of individual functions `fᵢ(x)`, assuming that each function has a first
/// derivative.
pub trait SummationC1<D:Dimension> : Summation<D> + FunctionC1<D> {
    /// The required method the user must furnish.
    /// Computes the gradient of one individual function identified by `term` at the given
    /// `position`, without a 1/n renormalization.
    /// The gradient indexes and the position indexes must correspond.
    fn term_gradient(&self, position: &Array<f64, D>, term: &usize, gradient : &mut Array<f64, D>);

    // The gradient is passed as an argument to avoid reallocation.
    /// Computes the sum of the term gradients over a set of `terms` at the given `position`,
    /// without a 1/n renormalization.
    fn partial_gradient(&self, position: &Array<f64, D>, terms: &[usize], gradient : &mut Array<f64, D>) {
        assert!(!terms.is_empty());
        gradient.fill(0.);
        let mut term_gradient : Array<f64, D> = gradient.clone();
        // We could also parallelize over gradient components here if the dimension of the data is very large.
        if terms.len() < 1000 {
            for term in terms {
                self.term_gradient(position, term, &mut term_gradient);
                *gradient += &term_gradient;
            }
        }
        else {
            // We do not map each term directly with rayon because we want to avoid too many
            // allocations of term_gradient, so we split the iterations into blocks.
            let block_size = 500;
            // number of blocks, rounding up
            let nb_block = (terms.len() + block_size - 1) / block_size;
            // first and last (exclusive) index of block i
            let first = |i: usize| -> usize { i * block_size };
            let last = |i: usize| -> usize { ((i + 1) * block_size).min(terms.len()) };
            // sums the term gradients of one block
            let compute = |i: usize| -> Array<f64, D> {
                let mut block_gradient : Array<f64, D> = gradient.clone();
                block_gradient.fill(0.);
                let mut term_gradient : Array<f64, D> = gradient.clone();
                for k in first(i)..last(i) {
                    self.term_gradient(position, &terms[k], &mut term_gradient);
                    block_gradient += &term_gradient;
                }
                block_gradient
            }; // end compute function
            //
            // now run compute on each block in parallel and sum the block gradients
            //
            *gradient = (0..nb_block).into_par_iter().map(|b| compute(b)).reduce(|| gradient.clone(), |acc, g| acc + g);
        }
    } // end partial_gradient

    /// In batched stochastic gradient we need the mean of the gradients over a batch or mini-batch.
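    ///
    /// A short usage sketch (`f`, `x` and the batch indexes are illustrative only):
    ///
    /// ```ignore
    /// let batch: Vec<usize> = vec![0, 3, 7];
    /// let mut grad = x.clone();
    /// f.mean_partial_gradient(&x, &batch, &mut grad);
    /// // grad now holds (∇f₀(x) + ∇f₃(x) + ∇f₇(x)) / 3
    /// ```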
    fn mean_partial_gradient(&self, position: &Array<f64, D>, terms: &[usize], gradient : &mut Array<f64, D>)  {
        self.partial_gradient(position, terms, gradient);
        gradient.iter_mut().for_each(|x| *x /= terms.len() as f64);
    }  // end of mean_partial_gradient

}    // end trait SummationC1


impl<D:Dimension, S: SummationC1<D> > FunctionC1<D> for S {
    fn gradient(&self, position: &Array<f64, D>) -> Array<f64, D> {
        let mut gradient : Array<f64, D> = position.clone();
        gradient.fill(0.);
        let mut gradient_term : Array<f64, D> = position.clone();
        gradient_term.fill(0.);
        // CAVEAT : should this threshold be user parametrized?
        if self.terms() < 2000 {
            for term in 0..self.terms() {
                self.term_gradient(position, &term, &mut gradient_term);
                gradient += &gradient_term;
            }
        }
        else {
            // We do not map each term directly with rayon because we want to avoid too many
            // allocations of term_gradient, so we split the iterations into blocks.
            let block_size = 1000;
            // number of blocks, rounding up
            let nb_block = (self.terms() + block_size - 1) / block_size;
            // first and last (exclusive) index of block i
            let first = |i: usize| -> usize { i * block_size };
            let last = |i: usize| -> usize { ((i + 1) * block_size).min(self.terms()) };
            // sums the term gradients of one block
            let compute = |i: usize| -> Array<f64, D> {
                let mut block_gradient : Array<f64, D> = gradient.clone();
                block_gradient.fill(0.);
                let mut term_gradient : Array<f64, D> = gradient.clone();
                for k in first(i)..last(i) {
                    self.term_gradient(position, &k, &mut term_gradient);
                    block_gradient += &term_gradient;
                }
                block_gradient
            }; // end compute function
            //
            // now run compute on each block in parallel and sum the block gradients
            //
            gradient = (0..nb_block).into_par_iter().map(|b| compute(b)).reduce(|| gradient.clone(), |acc, g| acc + g);
        }
        // renormalize to get the mean gradient over all terms
        gradient.iter_mut().for_each(|x| *x /= self.terms() as f64);
        gradient
    }

}  // end of impl<S: SummationC1> FunctionC1 for S


/// Defines an optimizer that is able to minimize a given objective function `F`.
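///
/// A minimal sketch (illustrative only, not provided by this crate) of a fixed-step
/// gradient-descent minimizer implementing this trait, assuming `MinimizerArg` is the
/// number of iterations:
///
/// ```ignore
/// struct FixedStepGd { step: f64 }
///
/// impl<D: Dimension, F: FunctionC1<D>> Minimizer<D, F, usize> for FixedStepGd {
///     type Solution = Solution<D>;
///
///     fn minimize(&self, function: &F, initial_position: &Array<f64, D>, args: Option<usize>) -> Self::Solution {
///         let nb_iter = args.unwrap_or(100);
///         let mut position = initial_position.clone();
///         for _ in 0..nb_iter {
///             let gradient = function.gradient(&position);
///             // position -= step * gradient
///             position.scaled_add(-self.step, &gradient);
///         }
///         let value = function.value(&position);
///         Solution::new(position, value)
///     }
/// }
/// ```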
pub trait Minimizer<D: Dimension, F: ?Sized, MinimizerArg> {
    /// Type of the solution the `Minimizer` returns.
    type Solution: Evaluation<D>;

    /// Performs the actual minimization and returns a solution.
    /// `MinimizerArg` should provide the number of iterations, a minimum error,
    /// or anything else needed by the implemented algorithm.
    fn minimize(&self, function: &F, initial_position: &Array<f64,D>, args : Option<MinimizerArg>) -> Self::Solution;
}

/// Captures the essence of a function evaluation.
pub trait Evaluation<D:Dimension> {
    /// Position `x` with the lowest corresponding value `f(x)`.
    fn position(&self) -> &Array<f64,D>;

    /// The actual value `f(x)`.
    fn value(&self) -> f64;
}

/// A solution of a minimization run providing only the minimal information.
///
/// Each `Minimizer` might yield different types of solution structs which provide more
/// information.
#[derive(Debug, Clone)]
pub struct Solution<D:Dimension> {
    /// Position `x` of the lowest corresponding value `f(x)` that has been found.
    pub position: Array<f64,D>,
    /// The actual value `f(x)`.
    pub value: f64
}

impl <D:Dimension> Solution<D> {
    /// Creates a new `Solution` given the `position` as well as the corresponding `value`.
    pub fn new(position: Array<f64,D>, value: f64) -> Solution<D> {
        Solution { position, value }
    }
}

impl <D:Dimension> Evaluation<D> for Solution<D> {
    fn position(&self) -> &Array<f64,D> {
        &self.position
    }

    fn value(&self) -> f64 {
        self.value
    }
}


/// Returns the L2 norm of `gradient`.
#[allow(dead_code)]
pub fn norm_l2<D:Dimension>(gradient : &Array<f64,D>) -> f64 {
    let norm = gradient.fold(0., |norm, x| norm + x * x);
    norm.sqrt()
}
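

// A small illustrative test module: a sketch using a toy one-dimensional quadratic
// summation (the type `Quadratic` and its data are hypothetical, chosen only to
// exercise the blanket `value`/`gradient` implementations above).
#[cfg(test)]
mod tests {
    use super::*;
    use ndarray::{Array1, Ix1};

    struct Quadratic { centers: Vec<f64> }

    impl Summation<Ix1> for Quadratic {
        fn terms(&self) -> usize { self.centers.len() }
        fn term_value(&self, position: &Array1<f64>, term: usize) -> f64 {
            (position[0] - self.centers[term]).powi(2)
        }
    }

    impl SummationC1<Ix1> for Quadratic {
        fn term_gradient(&self, position: &Array1<f64>, term: &usize, gradient: &mut Array1<f64>) {
            gradient[0] = 2. * (position[0] - self.centers[*term]);
        }
    }

    #[test]
    fn mean_value_and_gradient_of_quadratic() {
        let f = Quadratic { centers: vec![0., 1., 2., 3.] };
        let x = Array1::<f64>::zeros(1);
        // mean of the squared distances at x = 0 : (0 + 1 + 4 + 9) / 4 = 3.5
        assert!((f.value(&x) - 3.5).abs() < 1.0e-10);
        // mean gradient at x = 0 : 2 * (0 - mean(centers)) = 2 * (0 - 1.5) = -3
        let g = f.gradient(&x);
        assert!((g[0] + 3.).abs() < 1.0e-10);
    }
} // end of mod tests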