multistochgrad/types.rs

//! This file is inspired by the crate optimisation written by Oliver Mader <b52@reaktor42.de>.
//! I kept the traits Function, FunctionC1, Summation and SummationC1, which provide
//! the interface for users to define a minimisation problem.
//!
//! In fact, when minimising a summation function we often seek to minimize the mean of the terms,
//! which is the same objective up to a scaling of the gradient. This makes the implementation of
//! batched stochastic gradient more natural, as we always compute the mean gradient over the terms
//! taken into account.
//!
//! I kept the names of the methods and reimplemented them with the following changes:
//!
//! 1. In batched stochastic gradient we need to define the mean gradient on a subset of indexes.
//! 2. I use the crate ndarray, which provides vector addition and works well with rayon for parallelism.
//! 3. I got rid of the iterator on indexes as indexes are always usize.
//! 4. The function minimize in trait Minimizer takes a generic argument for future extension.
//! 5. I use rayon parallel iterators to compute the values and gradients of a summation.
//!


use rayon::prelude::*;

use ndarray::{Array, Dimension};


/// Defines an objective function `f` that is subject to minimization.
/// The trait must be `Sync` so that implementors can be shared across threads.
pub trait Function<D: Dimension>: Sync {
    /// Computes the objective function at a given `position` `x`, i.e., `f(x) = y`.
    fn value(&self, position: &Array<f64, D>) -> f64;
}


/// Defines an objective function `f` that is able to compute the first derivative
/// `f'(x)`.
pub trait FunctionC1<D: Dimension>: Function<D> {
    /// Computes the gradient of the objective function at a given `position` `x`,
    /// i.e., `∀ᵢ ∂/∂xᵢ f(x) = ∇f(x)`.
    fn gradient(&self, position: &Array<f64, D>) -> Array<f64, D>;
}


/// Defines a summation of individual functions, i.e., f(x) = ∑ᵢ fᵢ(x).
pub trait Summation<D: Dimension>: Function<D> {
    /// Returns the number of individual functions that are terms of the summation.
    fn terms(&self) -> usize;

    /// Computes the value of one individual function identified by its index `term`,
    /// given the `position` `x`.
    fn term_value(&self, position: &Array<f64, D>, term: usize) -> f64;

    /// Computes the partial sum over a set of individual functions identified by `terms`,
    /// without dividing by the number of terms.
    fn partial_value(&self, position: &Array<f64, D>, terms: &[usize]) -> f64 {
        terms.into_par_iter().map(|t| self.term_value(position, *t)).sum()
    }
} // end trait Summation
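
// The following is an illustrative sketch, not part of the original API: a hypothetical
// `SquaredDeviations` problem whose i-th term is the squared deviation of a 1-dimensional
// position from an observed point, i.e. fᵢ(x) = (x[0] - obsᵢ)².
// Implementing `Summation` is all a user has to do; `Function::value` (the mean of the
// terms) is then provided by the blanket implementation just below.
#[allow(dead_code)]
struct SquaredDeviations {
    observations: Vec<f64>,
}

impl Summation<ndarray::Ix1> for SquaredDeviations {
    fn terms(&self) -> usize {
        self.observations.len()
    }

    fn term_value(&self, position: &Array<f64, ndarray::Ix1>, term: usize) -> f64 {
        let deviation = position[0] - self.observations[term];
        deviation * deviation
    }
}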


/// The blanket implementation of `Function` for a `Summation` always returns the mean of the sum.
impl<D: Dimension, S: Summation<D>> Function<D> for S {
    fn value(&self, position: &Array<f64, D>) -> f64 {
        let all_terms: Vec<usize> = (0..self.terms()).collect();
        let value = self.partial_value(position, &all_terms);
        value / (self.terms() as f64)
    }
}
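
// A small sanity check (again just a sketch) that the blanket implementation above returns
// the mean of the term values, using the hypothetical `SquaredDeviations` example defined above.
#[cfg(test)]
mod mean_value_tests {
    use super::*;
    use ndarray::Array1;

    #[test]
    fn value_is_mean_of_terms() {
        let problem = SquaredDeviations { observations: vec![0.0, 2.0, 4.0] };
        let position = Array1::from(vec![1.0]);
        // the terms are (1-0)² = 1, (1-2)² = 1 and (1-4)² = 9, so the mean is 11/3
        let expected = (1.0 + 1.0 + 9.0) / 3.0;
        assert!((problem.value(&position) - expected).abs() < 1.0e-12);
    }
}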


/// Defines a summation of individual functions `fᵢ(x)`, assuming that each function has a first
/// derivative.
pub trait SummationC1<D: Dimension>: Summation<D> + FunctionC1<D> {
    /// The required method the user must furnish.
    /// Computes the gradient of one individual function identified by `term` at the given
    /// `position`, without a 1/n renormalization.
    /// The indexes of `gradient` and `position` must correspond.
    fn term_gradient(&self, position: &Array<f64, D>, term: &usize, gradient: &mut Array<f64, D>);

    /// Computes the sum of the partial gradients over a set of `terms` at the given `position`,
    /// without a 1/n renormalization.
    /// The output `gradient` is passed as an argument to avoid reallocation.
    fn partial_gradient(&self, position: &Array<f64, D>, terms: &[usize], gradient: &mut Array<f64, D>) {
        assert!(terms.len() > 0);
        gradient.fill(0.);
        // we could use rayon here too if the dimension of the data is very large.
        if terms.len() < 1000 {
            let mut term_gradient: Array<f64, D> = gradient.clone();
            for term in terms {
                self.term_gradient(position, term, &mut term_gradient);
                *gradient += &term_gradient;
            }
        }
        else {
            // we do not use rayon directly because we want to avoid too many allocations of term_gradient,
            // so we split the iterations in blocks
            let block_size = 500;
            let nb_block = if terms.len() % block_size == 0 {
                terms.len() / block_size
            }
            else {
                1 + (terms.len() / block_size)
            };
            // first and (exclusive) last index of block i
            let first = |i: usize| -> usize { i * block_size };
            let last = |i: usize| -> usize { ((i + 1) * block_size).min(terms.len()) };
            // computes the sum of the term gradients of block i
            let compute = |i: usize| -> Array<f64, D> {
                let mut block_gradient: Array<f64, D> = gradient.clone();
                block_gradient.fill(0.);
                let mut term_gradient: Array<f64, D> = gradient.clone();
                for k in first(i)..last(i) {
                    self.term_gradient(position, &terms[k], &mut term_gradient);
                    block_gradient += &term_gradient;
                }
                block_gradient
            }; // end compute function
            //
            // now we execute compute on each block in parallel
            //
            *gradient = (0..nb_block).into_par_iter().map(|b| compute(b)).reduce(|| gradient.clone(), |acc, g| acc + g);
        }
    } // end partial_gradient


    /// In batched stochastic gradient we need the mean of the gradient over a batch or mini-batch.
    fn mean_partial_gradient(&self, position: &Array<f64, D>, terms: &[usize], gradient: &mut Array<f64, D>) {
        self.partial_gradient(position, terms, gradient);
        gradient.iter_mut().for_each(|x| *x /= terms.len() as f64);
    } // end of mean_partial_gradient

} // end trait SummationC1
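
// Another illustrative sketch (hypothetical, not part of the original crate): a tiny
// least-squares problem whose i-th term is fᵢ(θ) = (θ[0] + θ[1]·xᵢ - yᵢ)².
// A user only has to provide `terms`, `term_value` and `term_gradient`; the summed and
// mean gradients come from the default methods above, and `gradient`/`value` come from
// the blanket implementations of `FunctionC1` and `Function`.
#[allow(dead_code)]
struct LeastSquares1D {
    x: Vec<f64>,
    y: Vec<f64>,
}

impl Summation<ndarray::Ix1> for LeastSquares1D {
    fn terms(&self) -> usize {
        self.x.len()
    }

    fn term_value(&self, position: &Array<f64, ndarray::Ix1>, term: usize) -> f64 {
        let residual = position[0] + position[1] * self.x[term] - self.y[term];
        residual * residual
    }
}

impl SummationC1<ndarray::Ix1> for LeastSquares1D {
    fn term_gradient(&self, position: &Array<f64, ndarray::Ix1>, term: &usize, gradient: &mut Array<f64, ndarray::Ix1>) {
        let residual = position[0] + position[1] * self.x[*term] - self.y[*term];
        gradient[0] = 2. * residual;
        gradient[1] = 2. * residual * self.x[*term];
    }
}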



/// The blanket implementation of `FunctionC1` for a `SummationC1` returns the mean gradient,
/// consistent with the blanket `Function` implementation which returns the mean value.
impl<D: Dimension, S: SummationC1<D>> FunctionC1<D> for S {
    fn gradient(&self, position: &Array<f64, D>) -> Array<f64, D> {
        let mut gradient: Array<f64, D> = position.clone();
        gradient.fill(0.);
        // CAVEAT : should this threshold be a user parameter?
        if self.terms() < 2000 {
            let mut gradient_term: Array<f64, D> = position.clone();
            gradient_term.fill(0.);
            for term in 0..self.terms() {
                self.term_gradient(position, &term, &mut gradient_term);
                gradient += &gradient_term;
            }
        }
        else {
            // we do not use rayon directly because we want to avoid too many allocations of term_gradient,
            // so we split the iterations in blocks
            let block_size = 1000;
            let nb_block = if self.terms() % block_size == 0 {
                self.terms() / block_size
            }
            else {
                1 + (self.terms() / block_size)
            };
            // first and (exclusive) last index of block i
            let first = |i: usize| -> usize { i * block_size };
            let last = |i: usize| -> usize { ((i + 1) * block_size).min(self.terms()) };
            // computes the sum of the term gradients of block i
            let compute = |i: usize| -> Array<f64, D> {
                let mut block_gradient: Array<f64, D> = gradient.clone();
                block_gradient.fill(0.);
                let mut term_gradient: Array<f64, D> = gradient.clone();
                for k in first(i)..last(i) {
                    self.term_gradient(position, &k, &mut term_gradient);
                    block_gradient += &term_gradient;
                }
                block_gradient
            }; // end compute function
            //
            // now we execute compute on each block in parallel
            //
            gradient = (0..nb_block).into_par_iter().map(|b| compute(b)).reduce(|| gradient.clone(), |acc, g| acc + g);
        }
        //
        gradient.iter_mut().for_each(|x| *x /= self.terms() as f64);
        //
        gradient
    }

} // end of impl<S: SummationC1> FunctionC1 for S


/// Defines an optimizer that is able to minimize a given objective function `F`.
pub trait Minimizer<D: Dimension, F: ?Sized, MinimizerArg> {
    /// Type of the solution the `Minimizer` returns.
    type Solution: Evaluation<D>;

    /// Performs the actual minimization and returns a solution.
    /// `MinimizerArg` should provide the number of iterations, a minimal error,
    /// or anything else needed by the implemented algorithm.
    fn minimize(&self, function: &F, initial_position: &Array<f64, D>, args: Option<MinimizerArg>) -> Self::Solution;
}
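
// A sketch of how `Minimizer` can be implemented (hypothetical, not one of the crate's
// stochastic algorithms): a fixed-step, full-gradient descent whose `MinimizerArg` is an
// optional iteration count. It only relies on the traits above and on the
// `Solution`/`Evaluation` types defined below.
#[allow(dead_code)]
struct FixedStepGradientDescent {
    step_size: f64,
    default_nb_iter: usize,
}

impl<D: Dimension, F: SummationC1<D>> Minimizer<D, F, usize> for FixedStepGradientDescent {
    type Solution = Solution<D>;

    fn minimize(&self, function: &F, initial_position: &Array<f64, D>, args: Option<usize>) -> Self::Solution {
        let nb_iter = args.unwrap_or(self.default_nb_iter);
        let mut position = initial_position.clone();
        for _ in 0..nb_iter {
            // gradient() comes from the blanket FunctionC1 implementation, so it is already a mean gradient
            let gradient = function.gradient(&position);
            position.zip_mut_with(&gradient, |p, g| *p -= self.step_size * g);
        }
        let value = function.value(&position);
        Solution::new(position, value)
    }
}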


/// Captures the essence of a function evaluation.
pub trait Evaluation<D: Dimension> {
    /// Position `x` with the lowest corresponding value `f(x)`.
    fn position(&self) -> &Array<f64, D>;

    /// The actual value `f(x)`.
    fn value(&self) -> f64;
}


/// A solution of a minimization run providing only the minimal information.
///
/// Each `Minimizer` might yield different types of solution structs which provide more
/// information.
#[derive(Debug, Clone)]
pub struct Solution<D: Dimension> {
    /// Position `x` of the lowest corresponding value `f(x)` that has been found.
    pub position: Array<f64, D>,
    /// The actual value `f(x)`.
    pub value: f64,
}

impl<D: Dimension> Solution<D> {
    /// Creates a new `Solution` given the `position` as well as the corresponding `value`.
    pub fn new(position: Array<f64, D>, value: f64) -> Solution<D> {
        Solution { position, value }
    }
}

impl<D: Dimension> Evaluation<D> for Solution<D> {
    fn position(&self) -> &Array<f64, D> {
        &self.position
    }

    fn value(&self) -> f64 {
        self.value
    }
}


/// Returns the L2 norm of `gradient`.
#[allow(dead_code)]
pub fn norm_l2<D: Dimension>(gradient: &Array<f64, D>) -> f64 {
    let norm = gradient.fold(0., |norm, x| norm + (x * x));
    norm.sqrt()
}
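
// An end-to-end sketch test using the hypothetical `LeastSquares1D` problem and
// `FixedStepGradientDescent` minimizer defined above: full gradient descent on a perfectly
// linear data set should drive the mean squared error and the gradient norm towards 0.
#[cfg(test)]
mod minimizer_sketch_tests {
    use super::*;
    use ndarray::Array1;

    #[test]
    fn gradient_descent_fits_a_line() {
        // data generated from y = 1 + 2 x, so the optimum is θ = (1, 2) with value 0
        let x: Vec<f64> = (0..10).map(|i| i as f64 / 10.).collect();
        let y: Vec<f64> = x.iter().map(|v| 1. + 2. * v).collect();
        let problem = LeastSquares1D { x, y };
        let minimizer = FixedStepGradientDescent { step_size: 0.1, default_nb_iter: 5000 };
        let start = Array1::from(vec![0.0, 0.0]);
        let solution = minimizer.minimize(&problem, &start, None);
        assert!(solution.value() < 1.0e-6);
        assert!(norm_l2(&problem.gradient(solution.position())) < 1.0e-3);
    }
}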