// rusty_machine/learning/nnet.rs

//! Neural Network module
//!
//! Contains an implementation of a simple feed-forward neural network.
//!
//! # Usage
//!
//! ```
//! use rusty_machine::learning::nnet::{NeuralNet, BCECriterion};
//! use rusty_machine::learning::toolkit::regularization::Regularization;
//! use rusty_machine::learning::optim::grad_desc::StochasticGD;
//! use rusty_machine::linalg::Matrix;
//! use rusty_machine::learning::SupModel;
//!
//! let inputs = Matrix::new(5,3, vec![1.,1.,1.,2.,2.,2.,3.,3.,3.,
//!                                 4.,4.,4.,5.,5.,5.,]);
//! let targets = Matrix::new(5,3, vec![1.,0.,0.,0.,1.,0.,0.,0.,1.,
//!                                     0.,0.,1.,0.,0.,1.]);
//!
//! // Set the layer sizes - from input to output
//! let layers = &[3,5,11,7,3];
//!
//! // Choose the BCE criterion with L2 regularization (`lambda=0.1`).
//! let criterion = BCECriterion::new(Regularization::L2(0.1));
//!
//! // We will just use the default stochastic gradient descent.
//! let mut model = NeuralNet::new(layers, criterion, StochasticGD::default());
//!
//! // Train the model!
//! model.train(&inputs, &targets).unwrap();
//!
//! let test_inputs = Matrix::new(2,3, vec![1.5,1.5,1.5,5.1,5.1,5.1]);
//!
//! // And predict new output from the test inputs
//! let outputs = model.predict(&test_inputs).unwrap();
//! ```
//!
//! The neural networks are specified via a criterion - similar to
//! [Torch](https://github.com/torch/nn/blob/master/doc/criterion.md).
//! A criterion combines an activation function and a cost function.
//!
//! You can define your own criterion by implementing the `Criterion`
//! trait with a concrete `ActivationFunc` and `CostFunc`.
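//!
//! For example, a criterion pairing the sigmoid activation with the mean
//! squared error cost (both available in the `toolkit` modules) could be
//! written as follows; the name `SigmoidMSECriterion` is purely illustrative:
//!
//! ```
//! use rusty_machine::learning::nnet::Criterion;
//! use rusty_machine::learning::toolkit::activ_fn;
//! use rusty_machine::learning::toolkit::cost_fn;
//!
//! struct SigmoidMSECriterion;
//!
//! impl Criterion for SigmoidMSECriterion {
//!     type ActFunc = activ_fn::Sigmoid;
//!     type Cost = cost_fn::MeanSqError;
//! }
//! ```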

use linalg::{Matrix, MatrixSlice, BaseMatrix, BaseMatrixMut};

use learning::{LearningResult, SupModel};
use learning::error::{Error, ErrorKind};
use learning::toolkit::activ_fn;
use learning::toolkit::activ_fn::ActivationFunc;
use learning::toolkit::cost_fn;
use learning::toolkit::cost_fn::CostFunc;
use learning::toolkit::regularization::Regularization;
use learning::optim::{Optimizable, OptimAlgorithm};
use learning::optim::grad_desc::StochasticGD;

use rand::thread_rng;
use rand::distributions::{Sample, range};

/// Neural Network Model
///
/// The Neural Network struct specifies a Criterion and
/// a gradient descent algorithm.
#[derive(Debug)]
pub struct NeuralNet<'a, T, A>
    where T: Criterion,
          A: OptimAlgorithm<BaseNeuralNet<'a, T>>
{
    base: BaseNeuralNet<'a, T>,
    alg: A,
}

/// Supervised learning for the Neural Network.
///
/// The model is trained using back propagation.
impl<'a, T, A> SupModel<Matrix<f64>, Matrix<f64>> for NeuralNet<'a, T, A>
    where T: Criterion,
          A: OptimAlgorithm<BaseNeuralNet<'a, T>>
{
    /// Predict neural network output using forward propagation.
    fn predict(&self, inputs: &Matrix<f64>) -> LearningResult<Matrix<f64>> {
        self.base.forward_prop(inputs)
    }

    /// Train the model using gradient optimization and back propagation.
    fn train(&mut self, inputs: &Matrix<f64>, targets: &Matrix<f64>) -> LearningResult<()> {
        let optimal_w = self.alg.optimize(&self.base, &self.base.weights, inputs, targets);
        self.base.weights = optimal_w;
        Ok(())
    }
}

impl<'a> NeuralNet<'a, BCECriterion, StochasticGD> {
    /// Creates a neural network with the specified layer sizes.
    ///
    /// The layer sizes slice should include the input, hidden, and output layer sizes.
    ///
    /// Uses the default settings: the `BCECriterion` (sigmoid activation with
    /// cross entropy cost) and stochastic gradient descent.
    ///
    /// # Examples
    ///
    /// ```
    /// use rusty_machine::learning::nnet::NeuralNet;
    ///
    /// // Create a neural net with 4 layers, 3 neurons in each.
    /// let layers = &[3; 4];
    /// let mut net = NeuralNet::default(layers);
    /// ```
    pub fn default(layer_sizes: &[usize]) -> NeuralNet<BCECriterion, StochasticGD> {
        NeuralNet {
            base: BaseNeuralNet::default(layer_sizes),
            alg: StochasticGD::default(),
        }
    }
}

impl<'a, T, A> NeuralNet<'a, T, A>
    where T: Criterion,
          A: OptimAlgorithm<BaseNeuralNet<'a, T>>
{
    /// Create a new neural network with the specified layer sizes.
    ///
    /// The layer sizes slice should include the input, hidden, and output layer sizes.
    /// The criterion (which fixes the activation and cost functions) and the
    /// optimization algorithm must be supplied by the caller.
    ///
    /// # Examples
    ///
    /// ```
    /// use rusty_machine::learning::nnet::BCECriterion;
    /// use rusty_machine::learning::nnet::NeuralNet;
    /// use rusty_machine::learning::optim::grad_desc::StochasticGD;
    ///
    /// // Create a neural net with 4 layers, 3 neurons in each.
    /// let layers = &[3; 4];
    /// let mut net = NeuralNet::new(layers, BCECriterion::default(), StochasticGD::default());
    /// ```
    pub fn new(layer_sizes: &'a [usize], criterion: T, alg: A) -> NeuralNet<'a, T, A> {
        NeuralNet {
            base: BaseNeuralNet::new(layer_sizes, criterion),
            alg: alg,
        }
    }

    /// Gets the matrix of weights between the specified layer and the next layer.
    ///
    /// # Examples
    ///
    /// ```
    /// use rusty_machine::linalg::BaseMatrix;
    /// use rusty_machine::learning::nnet::NeuralNet;
    ///
    /// // Create a neural net with 4 layers, 3 neurons in each.
    /// let layers = &[3; 4];
    /// let mut net = NeuralNet::default(layers);
    ///
    /// let w = &net.get_net_weights(2);
    ///
    /// // The weight matrix has an extra row for the bias terms.
    /// assert_eq!(w.rows(), 4);
    /// assert_eq!(w.cols(), 3);
    /// ```
    pub fn get_net_weights(&self, idx: usize) -> MatrixSlice<f64> {
        self.base.get_layer_weights(&self.base.weights[..], idx)
    }
}

/// Base Neural Network struct
///
/// This struct cannot be instantiated directly and is used internally only.
#[derive(Debug)]
pub struct BaseNeuralNet<'a, T: Criterion> {
    layer_sizes: &'a [usize],
    weights: Vec<f64>,
    criterion: T,
}

impl<'a> BaseNeuralNet<'a, BCECriterion> {
    /// Creates a base neural network with the specified layer sizes.
    fn default(layer_sizes: &[usize]) -> BaseNeuralNet<BCECriterion> {
        BaseNeuralNet::new(layer_sizes, BCECriterion::default())
    }
}

impl<'a, T: Criterion> BaseNeuralNet<'a, T> {
    /// Create a new base neural network with the specified layer sizes.
    fn new(layer_sizes: &[usize], criterion: T) -> BaseNeuralNet<T> {
        BaseNeuralNet {
            layer_sizes: layer_sizes,
            weights: BaseNeuralNet::<T>::create_weights(layer_sizes),
            criterion: criterion,
        }
    }

    /// Creates initial weights for all neurons in the network.
    fn create_weights(layer_sizes: &[usize]) -> Vec<f64> {
        let mut between = range::Range::new(0f64, 1f64);
        let mut rng = thread_rng();
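        // For each pair of adjacent layers, the weights (including the extra
        // bias row, hence the `+ 1` below) are drawn uniformly from
        // [-eps, eps] with eps = sqrt(6 / (l_in + l_out)), a Xavier/Glorot-style
        // initialization.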
        layer_sizes.windows(2)
            .flat_map(|w| {
                let l_in = w[0] + 1;
                let l_out = w[1];
                let eps_init = (6f64 / (l_in + l_out) as f64).sqrt();
                (0..l_in * l_out)
                    .map(|_i| (between.sample(&mut rng) * 2f64 * eps_init) - eps_init)
                    .collect::<Vec<_>>()
            })
            .collect()
    }

    /// Gets the matrix of weights between the specified layer and the next layer,
    /// sliced from the provided flat weights vector.
    fn get_layer_weights(&self, weights: &[f64], idx: usize) -> MatrixSlice<f64> {
        debug_assert!(idx < self.layer_sizes.len() - 1);

        // Check that the weights are the right size.
        let mut full_size = 0usize;
        for l in 0..self.layer_sizes.len() - 1 {
            full_size += (self.layer_sizes[l] + 1) * self.layer_sizes[l + 1];
        }

        debug_assert_eq!(full_size, weights.len());

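        // The weights are stored in a single flat vector, one layer after
        // another. Skip over the blocks of the preceding layers to find where
        // the (layer_sizes[idx] + 1) x layer_sizes[idx + 1] block for this
        // layer begins.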
        let mut start = 0usize;

        for l in 0..idx {
            start += (self.layer_sizes[l] + 1) * self.layer_sizes[l + 1];
        }

        unsafe {
            MatrixSlice::from_raw_parts(weights.as_ptr().offset(start as isize),
                                        self.layer_sizes[idx] + 1,
                                        self.layer_sizes[idx + 1],
                                        self.layer_sizes[idx + 1])
        }
    }

    /// Gets matrix of weights between specified layer and forward layer
    /// for the base model.
    fn get_net_weights(&self, idx: usize) -> MatrixSlice<f64> {
        self.get_layer_weights(&self.weights[..], idx)
    }

    /// Gets the weights for a layer excluding the bias weights.
    fn get_non_bias_weights(&self, weights: &[f64], idx: usize) -> MatrixSlice<f64> {
        let layer_weights = self.get_layer_weights(weights, idx);
        layer_weights.reslice([1, 0], layer_weights.rows() - 1, layer_weights.cols())
    }

    /// Compute the gradient using the back propagation algorithm.
    fn compute_grad(&self,
                    weights: &[f64],
                    inputs: &Matrix<f64>,
                    targets: &Matrix<f64>)
                    -> (f64, Vec<f64>) {
        assert_eq!(inputs.cols(), self.layer_sizes[0]);

        let mut forward_weights = Vec::with_capacity(self.layer_sizes.len() - 1);
        let mut activations = Vec::with_capacity(self.layer_sizes.len());

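        // Prepend a column of ones to the inputs to act as the bias terms.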
        let net_data = Matrix::ones(inputs.rows(), 1).hcat(inputs);

        activations.push(net_data.clone());

        // Forward propagation
        {
            let mut z = net_data * self.get_layer_weights(weights, 0);
            forward_weights.push(z.clone());

            for l in 1..self.layer_sizes.len() - 1 {
                let mut a = self.criterion.activate(z.clone());
                let ones = Matrix::ones(a.rows(), 1);

                a = ones.hcat(&a);

                z = &a * self.get_layer_weights(weights, l);
                activations.push(a);
                forward_weights.push(z.clone());
            }

            activations.push(self.criterion.activate(z));
        }

        let mut deltas = Vec::with_capacity(self.layer_sizes.len() - 1);
        // Backward propagation
        {
            let z = forward_weights[self.layer_sizes.len() - 2].clone();
            let g = self.criterion.grad_activ(z);

            // Use the gradient of the cost function to compute the output-layer delta.
            let mut delta = self.criterion
                .cost_grad(&activations[self.layer_sizes.len() - 1], targets)
                .elemul(&g);

            deltas.push(delta.clone());

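            // Propagate the error backwards through the hidden layers,
            // dropping the bias column from each delta as we go.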
            for l in (1..self.layer_sizes.len() - 1).rev() {
                let mut z = forward_weights[l - 1].clone();
                let ones = Matrix::ones(z.rows(), 1);
                z = ones.hcat(&z);

                let g = self.criterion.grad_activ(z);
                delta = (delta * Matrix::from(self.get_layer_weights(weights, l)).transpose())
                    .elemul(&g);

                // Drop the bias column from the delta.
                let non_bias_cols = &(1..delta.cols()).collect::<Vec<usize>>()[..];
                delta = delta.select_cols(non_bias_cols);
                deltas.push(delta.clone());
            }
        }

        let mut gradients = Vec::with_capacity(weights.len());

        for (l, activ_item) in activations.iter().take(self.layer_sizes.len() - 1).enumerate() {
            // Compute the gradient for this layer; the deltas were pushed in
            // reverse order during back propagation.
            let mut g = deltas[self.layer_sizes.len() - 2 - l].transpose() * activ_item;

            // Add the regularized gradient
            if self.criterion.is_regularized() {
                let non_bias_weights = self.get_non_bias_weights(weights, l);
                let zeros = Matrix::zeros(1, non_bias_weights.cols());
                g += zeros.vcat(&self.criterion.reg_cost_grad(non_bias_weights));
            }

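            // Average the gradient over the batch and flatten it into the
            // parameter vector layout expected by the optimizer.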
            gradients.append(&mut (g / inputs.rows() as f64).into_vec());
        }

        // Compute the cost
        let mut cost = self.criterion.cost(&activations[activations.len() - 1], targets);

        // Add the regularized cost
        if self.criterion.is_regularized() {
            for i in 0..self.layer_sizes.len() - 1 {
                cost += self.criterion.reg_cost(self.get_non_bias_weights(weights, i));
            }
        }

        (cost, gradients)
    }

    /// Forward propagates the inputs through the network to compute the outputs.
    fn forward_prop(&self, inputs: &Matrix<f64>) -> LearningResult<Matrix<f64>> {
        if inputs.cols() != self.layer_sizes[0] {
            Err(Error::new(ErrorKind::InvalidData,
                           "The input data dimensions must match the first layer."))
        } else {
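            // As in training, prepend a column of ones to act as the bias terms.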
            let net_data = Matrix::ones(inputs.rows(), 1).hcat(inputs);

            let mut z = net_data * self.get_net_weights(0);
            let mut a = self.criterion.activate(z.clone());

            for l in 1..self.layer_sizes.len() - 1 {
                let ones = Matrix::ones(a.rows(), 1);
                a = ones.hcat(&a);
                z = a * self.get_net_weights(l);
                a = self.criterion.activate(z.clone());
            }

            Ok(a)
        }
    }
}

/// Compute the gradient of the Neural Network using the
/// back propagation algorithm.
impl<'a, T: Criterion> Optimizable for BaseNeuralNet<'a, T> {
    type Inputs = Matrix<f64>;
    type Targets = Matrix<f64>;

    /// Compute the gradient of the neural network.
    fn compute_grad(&self,
                    params: &[f64],
                    inputs: &Matrix<f64>,
                    targets: &Matrix<f64>)
                    -> (f64, Vec<f64>) {
        self.compute_grad(params, inputs, targets)
    }
}

/// Criterion for Neural Networks
///
/// Specifies an activation function and a cost function.
pub trait Criterion {
    /// The activation function for the criterion.
    type ActFunc: ActivationFunc;
    /// The cost function for the criterion.
    type Cost: CostFunc<Matrix<f64>>;

    /// The activation function applied to a matrix.
    fn activate(&self, mat: Matrix<f64>) -> Matrix<f64> {
        mat.apply(&Self::ActFunc::func)
    }

    /// The gradient of the activation function applied to a matrix.
    fn grad_activ(&self, mat: Matrix<f64>) -> Matrix<f64> {
        mat.apply(&Self::ActFunc::func_grad)
    }

    /// The cost function.
    ///
    /// Returns a scalar cost.
    fn cost(&self, outputs: &Matrix<f64>, targets: &Matrix<f64>) -> f64 {
        Self::Cost::cost(outputs, targets)
    }

    /// The gradient of the cost function.
    ///
    /// Returns a matrix of cost gradients.
    fn cost_grad(&self, outputs: &Matrix<f64>, targets: &Matrix<f64>) -> Matrix<f64> {
        Self::Cost::grad_cost(outputs, targets)
    }

    /// Returns the regularization for this criterion.
    ///
    /// Will return `Regularization::None` by default.
    fn regularization(&self) -> Regularization<f64> {
        Regularization::None
    }

    /// Checks if the current criterion includes regularization.
    ///
    /// Will return `false` by default.
    fn is_regularized(&self) -> bool {
        match self.regularization() {
            Regularization::None => false,
            _ => true,
        }
    }

    /// Returns the regularization cost for the criterion.
    ///
    /// Will return `0` by default.
    ///
    /// This method will not be invoked by the neural network
    /// if there is explicitly no regularization.
    fn reg_cost(&self, reg_weights: MatrixSlice<f64>) -> f64 {
        self.regularization().reg_cost(reg_weights)
    }

    /// Returns the regularization gradient for the criterion.
    ///
    /// Will return a matrix of zeros by default.
    ///
    /// This method will not be invoked by the neural network
    /// if there is explicitly no regularization.
    fn reg_cost_grad(&self, reg_weights: MatrixSlice<f64>) -> Matrix<f64> {
        self.regularization().reg_grad(reg_weights)
    }
}

/// The binary cross entropy criterion.
///
/// Uses the Sigmoid activation function and the
/// cross entropy error.
#[derive(Clone, Copy, Debug)]
pub struct BCECriterion {
    regularization: Regularization<f64>,
}

impl Criterion for BCECriterion {
    type ActFunc = activ_fn::Sigmoid;
    type Cost = cost_fn::CrossEntropyError;

    fn regularization(&self) -> Regularization<f64> {
        self.regularization
    }
}

/// Creates a BCE Criterion without any regularization.
impl Default for BCECriterion {
    fn default() -> Self {
        BCECriterion { regularization: Regularization::None }
    }
}

impl BCECriterion {
    /// Constructs a new BCECriterion with the given regularization.
    ///
    /// # Examples
    ///
    /// ```
    /// use rusty_machine::learning::nnet::BCECriterion;
    /// use rusty_machine::learning::toolkit::regularization::Regularization;
    ///
    /// // Create a new BCE criterion with L2 regularization of 0.3.
    /// let criterion = BCECriterion::new(Regularization::L2(0.3f64));
    /// ```
    pub fn new(regularization: Regularization<f64>) -> Self {
        BCECriterion { regularization: regularization }
    }
}

/// The mean squared error criterion.
///
/// Uses the Linear activation function and the
/// mean squared error.
#[derive(Clone, Copy, Debug)]
pub struct MSECriterion {
    regularization: Regularization<f64>,
}

impl Criterion for MSECriterion {
    type ActFunc = activ_fn::Linear;
    type Cost = cost_fn::MeanSqError;

    fn regularization(&self) -> Regularization<f64> {
        self.regularization
    }
}

/// Creates an MSE Criterion without any regularization.
impl Default for MSECriterion {
    fn default() -> Self {
        MSECriterion { regularization: Regularization::None }
    }
}

impl MSECriterion {
    /// Constructs a new MSECriterion with the given regularization.
    ///
    /// # Examples
    ///
    /// ```
    /// use rusty_machine::learning::nnet::MSECriterion;
    /// use rusty_machine::learning::toolkit::regularization::Regularization;
    ///
    /// // Create a new MSE criterion with L2 regularization of 0.3.
    /// let criterion = MSECriterion::new(Regularization::L2(0.3f64));
    /// ```
    pub fn new(regularization: Regularization<f64>) -> Self {
        MSECriterion { regularization: regularization }
    }
}