rusty_machine/learning/nnet.rs
//! Neural Network module
//!
//! Contains an implementation of a simple feed-forward neural network.
//!
//! # Usage
//!
//! ```
//! use rusty_machine::learning::nnet::{NeuralNet, BCECriterion};
//! use rusty_machine::learning::toolkit::regularization::Regularization;
//! use rusty_machine::learning::optim::grad_desc::StochasticGD;
//! use rusty_machine::linalg::Matrix;
//! use rusty_machine::learning::SupModel;
//!
//! let inputs = Matrix::new(5,3, vec![1.,1.,1.,2.,2.,2.,3.,3.,3.,
//!                                    4.,4.,4.,5.,5.,5.,]);
//! let targets = Matrix::new(5,3, vec![1.,0.,0.,0.,1.,0.,0.,0.,1.,
//!                                     0.,0.,1.,0.,0.,1.]);
//!
//! // Set the layer sizes - from input to output
//! let layers = &[3,5,11,7,3];
//!
//! // Choose the BCE criterion with L2 regularization (`lambda=0.1`).
//! let criterion = BCECriterion::new(Regularization::L2(0.1));
//!
//! // We will just use the default stochastic gradient descent.
//! let mut model = NeuralNet::new(layers, criterion, StochasticGD::default());
//!
//! // Train the model!
//! model.train(&inputs, &targets).unwrap();
//!
//! let test_inputs = Matrix::new(2,3, vec![1.5,1.5,1.5,5.1,5.1,5.1]);
//!
//! // And predict new output from the test inputs
//! let outputs = model.predict(&test_inputs).unwrap();
//! ```
//!
//! The neural networks are specified via a criterion - similar to
//! [Torch](https://github.com/torch/nn/blob/master/doc/criterion.md).
//! A criterion combines an activation function and a cost function.
//!
//! You can define your own criterion by implementing the `Criterion`
//! trait with a concrete `ActivationFunc` and `CostFunc`.
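//!
//! For example, a criterion pairing the sigmoid activation function with the
//! mean squared error (a minimal sketch; the pairing is illustrative only,
//! and all other trait methods keep their default implementations):
//!
//! ```
//! use rusty_machine::learning::nnet::Criterion;
//! use rusty_machine::learning::toolkit::activ_fn::Sigmoid;
//! use rusty_machine::learning::toolkit::cost_fn::MeanSqError;
//!
//! // A custom criterion: sigmoid activation with squared-error cost.
//! struct SigmoidMSECriterion;
//!
//! impl Criterion for SigmoidMSECriterion {
//!     type ActFunc = Sigmoid;
//!     type Cost = MeanSqError;
//! }
//! ```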

use linalg::{Matrix, MatrixSlice, BaseMatrix, BaseMatrixMut};

use learning::{LearningResult, SupModel};
use learning::error::{Error, ErrorKind};
use learning::toolkit::activ_fn;
use learning::toolkit::activ_fn::ActivationFunc;
use learning::toolkit::cost_fn;
use learning::toolkit::cost_fn::CostFunc;
use learning::toolkit::regularization::Regularization;
use learning::optim::{Optimizable, OptimAlgorithm};
use learning::optim::grad_desc::StochasticGD;

use rand::thread_rng;
use rand::distributions::{Sample, range};

/// Neural Network Model
///
/// The Neural Network struct specifies a Criterion and
/// a gradient descent algorithm.
#[derive(Debug)]
pub struct NeuralNet<'a, T, A>
    where T: Criterion,
          A: OptimAlgorithm<BaseNeuralNet<'a, T>>
{
    base: BaseNeuralNet<'a, T>,
    alg: A,
}

/// Supervised learning for the Neural Network.
///
/// The model is trained using back propagation.
impl<'a, T, A> SupModel<Matrix<f64>, Matrix<f64>> for NeuralNet<'a, T, A>
    where T: Criterion,
          A: OptimAlgorithm<BaseNeuralNet<'a, T>>
{
    /// Predict neural network output using forward propagation.
    fn predict(&self, inputs: &Matrix<f64>) -> LearningResult<Matrix<f64>> {
        self.base.forward_prop(inputs)
    }

    /// Train the model using gradient optimization and back propagation.
    fn train(&mut self, inputs: &Matrix<f64>, targets: &Matrix<f64>) -> LearningResult<()> {
        let optimal_w = self.alg.optimize(&self.base, &self.base.weights, inputs, targets);
        self.base.weights = optimal_w;
        Ok(())
    }
}

impl<'a> NeuralNet<'a, BCECriterion, StochasticGD> {
    /// Creates a neural network with the specified layer sizes.
    ///
    /// The layer sizes slice should include the input, hidden layers, and output layer sizes.
    ///
    /// Uses the default settings (stochastic gradient descent and sigmoid activation function).
    ///
    /// # Examples
    ///
    /// ```
    /// use rusty_machine::learning::nnet::NeuralNet;
    ///
    /// // Create a neural net with 4 layers, 3 neurons in each.
    /// let layers = &[3; 4];
    /// let mut net = NeuralNet::default(layers);
    /// ```
    pub fn default(layer_sizes: &[usize]) -> NeuralNet<BCECriterion, StochasticGD> {
        NeuralNet {
            base: BaseNeuralNet::default(layer_sizes),
            alg: StochasticGD::default(),
        }
    }
}

impl<'a, T, A> NeuralNet<'a, T, A>
    where T: Criterion,
          A: OptimAlgorithm<BaseNeuralNet<'a, T>>
{
    /// Create a new neural network with the specified layer sizes.
    ///
    /// The layer sizes slice should include the input, hidden layers, and output layer sizes.
    /// The activation and cost functions are determined by the given criterion,
    /// and the optimization algorithm is supplied by the caller.
    ///
    /// # Examples
    ///
    /// ```
    /// use rusty_machine::learning::nnet::BCECriterion;
    /// use rusty_machine::learning::nnet::NeuralNet;
    /// use rusty_machine::learning::optim::grad_desc::StochasticGD;
    ///
    /// // Create a neural net with 4 layers, 3 neurons in each.
    /// let layers = &[3; 4];
    /// let mut net = NeuralNet::new(layers, BCECriterion::default(), StochasticGD::default());
    /// ```
    pub fn new(layer_sizes: &'a [usize], criterion: T, alg: A) -> NeuralNet<'a, T, A> {
        NeuralNet {
            base: BaseNeuralNet::new(layer_sizes, criterion),
            alg: alg,
        }
    }

    /// Gets the matrix of weights between the specified layer and the next layer.
    ///
    /// # Examples
    ///
    /// ```
    /// use rusty_machine::linalg::BaseMatrix;
    /// use rusty_machine::learning::nnet::NeuralNet;
    ///
    /// // Create a neural net with 4 layers, 3 neurons in each.
    /// let layers = &[3; 4];
    /// let mut net = NeuralNet::default(layers);
    ///
    /// let w = &net.get_net_weights(2);
    ///
    /// // We add a bias term to the weight matrix, hence the extra row.
    /// assert_eq!(w.rows(), 4);
    /// assert_eq!(w.cols(), 3);
    /// ```
    pub fn get_net_weights(&self, idx: usize) -> MatrixSlice<f64> {
        self.base.get_layer_weights(&self.base.weights[..], idx)
    }
}

/// Base Neural Network struct
///
/// This struct cannot be instantiated directly and is used internally only.
#[derive(Debug)]
pub struct BaseNeuralNet<'a, T: Criterion> {
    layer_sizes: &'a [usize],
    weights: Vec<f64>,
    criterion: T,
}


impl<'a> BaseNeuralNet<'a, BCECriterion> {
    /// Creates a base neural network with the specified layer sizes.
    fn default(layer_sizes: &[usize]) -> BaseNeuralNet<BCECriterion> {
        BaseNeuralNet::new(layer_sizes, BCECriterion::default())
    }
}


impl<'a, T: Criterion> BaseNeuralNet<'a, T> {
    /// Create a new base neural network with the specified layer sizes.
    fn new(layer_sizes: &[usize], criterion: T) -> BaseNeuralNet<T> {
        BaseNeuralNet {
            layer_sizes: layer_sizes,
            weights: BaseNeuralNet::<T>::create_weights(layer_sizes),
            criterion: criterion,
        }
    }

    /// Creates initial weights for all neurons in the network.
    ///
    /// Weights are drawn uniformly from `[-eps, eps)` with
    /// `eps = sqrt(6 / (l_in + l_out))`, a Xavier-style initialization.
    /// The `+ 1` accounts for the bias unit added to each input layer.
    fn create_weights(layer_sizes: &[usize]) -> Vec<f64> {
        let mut between = range::Range::new(0f64, 1f64);
        let mut rng = thread_rng();
        layer_sizes.windows(2)
            .flat_map(|w| {
                let l_in = w[0] + 1;
                let l_out = w[1];
                let eps_init = (6f64 / (l_in + l_out) as f64).sqrt();
                // Scale the uniform [0, 1) samples into [-eps, eps).
                (0..l_in * l_out)
                    .map(|_i| (between.sample(&mut rng) * 2f64 * eps_init) - eps_init)
                    .collect::<Vec<_>>()
            })
            .collect()
    }

    /// Gets the matrix of weights between the specified layer and the next
    /// layer, taken from the given flat weight vector.
    fn get_layer_weights(&self, weights: &[f64], idx: usize) -> MatrixSlice<f64> {
        debug_assert!(idx < self.layer_sizes.len() - 1);

        // Check that the weights are the right size.
        let mut full_size = 0usize;
        for l in 0..self.layer_sizes.len() - 1 {
            full_size += (self.layer_sizes[l] + 1) * self.layer_sizes[l + 1];
        }

        debug_assert_eq!(full_size, weights.len());

        // Compute the offset of this layer's weights in the flat vector.
        let mut start = 0usize;

        for l in 0..idx {
            start += (self.layer_sizes[l] + 1) * self.layer_sizes[l + 1]
        }

        unsafe {
            // The slice has `layer_sizes[idx] + 1` rows (including the bias row)
            // and `layer_sizes[idx + 1]` columns.
            MatrixSlice::from_raw_parts(weights.as_ptr().offset(start as isize),
                                        self.layer_sizes[idx] + 1,
                                        self.layer_sizes[idx + 1],
                                        self.layer_sizes[idx + 1])
        }
    }

    /// Gets the matrix of weights between the specified layer and the next
    /// layer for the base model's own weights.
    fn get_net_weights(&self, idx: usize) -> MatrixSlice<f64> {
        self.get_layer_weights(&self.weights[..], idx)
    }

    /// Gets the weights for a layer excluding the bias weights.
    fn get_non_bias_weights(&self, weights: &[f64], idx: usize) -> MatrixSlice<f64> {
        let layer_weights = self.get_layer_weights(weights, idx);
        layer_weights.reslice([1, 0], layer_weights.rows() - 1, layer_weights.cols())
    }

    /// Compute the cost and gradient using the back propagation algorithm.
    fn compute_grad(&self,
                    weights: &[f64],
                    inputs: &Matrix<f64>,
                    targets: &Matrix<f64>)
                    -> (f64, Vec<f64>) {
        assert_eq!(inputs.cols(), self.layer_sizes[0]);

        let mut forward_weights = Vec::with_capacity(self.layer_sizes.len() - 1);
        let mut activations = Vec::with_capacity(self.layer_sizes.len());

        // Add the bias column to the input data.
        let net_data = Matrix::ones(inputs.rows(), 1).hcat(inputs);

        activations.push(net_data.clone());

        // Forward propagation
        {
            let mut z = net_data * self.get_layer_weights(weights, 0);
            forward_weights.push(z.clone());

            for l in 1..self.layer_sizes.len() - 1 {
                let mut a = self.criterion.activate(z.clone());
                let ones = Matrix::ones(a.rows(), 1);

                // Add a bias column to the activations.
                a = ones.hcat(&a);

                z = &a * self.get_layer_weights(weights, l);
                activations.push(a);
                forward_weights.push(z.clone());
            }

            activations.push(self.criterion.activate(z));
        }

        let mut deltas = Vec::with_capacity(self.layer_sizes.len() - 1);
        // Backward propagation
        {
            let z = forward_weights[self.layer_sizes.len() - 2].clone();
            let g = self.criterion.grad_activ(z);

            // The output delta is the cost gradient multiplied
            // element-wise by the activation gradient.
            let mut delta = self.criterion
                .cost_grad(&activations[self.layer_sizes.len() - 1], targets)
                .elemul(&g);

            deltas.push(delta.clone());

            for l in (1..self.layer_sizes.len() - 1).rev() {
                let mut z = forward_weights[l - 1].clone();
                let ones = Matrix::ones(z.rows(), 1);
                z = ones.hcat(&z);

                let g = self.criterion.grad_activ(z);
                delta = (delta * Matrix::from(self.get_layer_weights(weights, l)).transpose())
                    .elemul(&g);

                // Drop the bias column from the delta.
                let non_bias_cols = &(1..delta.cols()).collect::<Vec<usize>>()[..];
                delta = delta.select_cols(non_bias_cols);
                deltas.push(delta.clone());
            }
        }

        let mut gradients = Vec::with_capacity(weights.len());

        for (l, activ_item) in activations.iter().take(self.layer_sizes.len() - 1).enumerate() {
            // Compute the gradient (deltas were pushed from the output layer
            // backwards, hence the reversed index).
            let mut g = deltas[self.layer_sizes.len() - 2 - l].transpose() * activ_item;

            // Add the regularized gradient, skipping the bias weights.
            if self.criterion.is_regularized() {
                let non_bias_weights = self.get_non_bias_weights(weights, l);
                let zeros = Matrix::zeros(1, non_bias_weights.cols());
                g += zeros.vcat(&self.criterion.reg_cost_grad(non_bias_weights));
            }

            gradients.append(&mut (g / inputs.rows() as f64).into_vec());
        }

        // Compute the cost
        let mut cost = self.criterion.cost(&activations[activations.len() - 1], targets);

        // Add the regularized cost
        if self.criterion.is_regularized() {
            for i in 0..self.layer_sizes.len() - 1 {
                cost += self.criterion.reg_cost(self.get_non_bias_weights(weights, i));
            }
        }

        (cost, gradients)
    }

    /// Forward propagation of the model weights to get the outputs.
    fn forward_prop(&self, inputs: &Matrix<f64>) -> LearningResult<Matrix<f64>> {
        if inputs.cols() != self.layer_sizes[0] {
            Err(Error::new(ErrorKind::InvalidData,
                           "The input data dimensions must match the first layer."))
        } else {
            // Add the bias column and propagate layer by layer.
            let net_data = Matrix::ones(inputs.rows(), 1).hcat(inputs);

            let mut z = net_data * self.get_net_weights(0);
            let mut a = self.criterion.activate(z.clone());

            for l in 1..self.layer_sizes.len() - 1 {
                let ones = Matrix::ones(a.rows(), 1);
                a = ones.hcat(&a);
                z = a * self.get_net_weights(l);
                a = self.criterion.activate(z.clone());
            }

            Ok(a)
        }
    }
}

/// Compute the gradient of the Neural Network using the
/// back propagation algorithm.
impl<'a, T: Criterion> Optimizable for BaseNeuralNet<'a, T> {
    type Inputs = Matrix<f64>;
    type Targets = Matrix<f64>;

    /// Compute the gradient of the neural network.
    fn compute_grad(&self,
                    params: &[f64],
                    inputs: &Matrix<f64>,
                    targets: &Matrix<f64>)
                    -> (f64, Vec<f64>) {
        self.compute_grad(params, inputs, targets)
    }
}

/// Criterion for Neural Networks
///
/// Specifies an activation function and a cost function.
pub trait Criterion {
    /// The activation function for the criterion.
    type ActFunc: ActivationFunc;
    /// The cost function for the criterion.
    type Cost: CostFunc<Matrix<f64>>;

    /// The activation function applied to a matrix.
    fn activate(&self, mat: Matrix<f64>) -> Matrix<f64> {
        mat.apply(&Self::ActFunc::func)
    }

    /// The gradient of the activation function applied to a matrix.
    fn grad_activ(&self, mat: Matrix<f64>) -> Matrix<f64> {
        mat.apply(&Self::ActFunc::func_grad)
    }

    /// The cost function.
    ///
    /// Returns a scalar cost.
    fn cost(&self, outputs: &Matrix<f64>, targets: &Matrix<f64>) -> f64 {
        Self::Cost::cost(outputs, targets)
    }

    /// The gradient of the cost function.
    ///
    /// Returns a matrix of cost gradients.
    fn cost_grad(&self, outputs: &Matrix<f64>, targets: &Matrix<f64>) -> Matrix<f64> {
        Self::Cost::grad_cost(outputs, targets)
    }

    /// Returns the regularization for this criterion.
    ///
    /// Will return `Regularization::None` by default.
    fn regularization(&self) -> Regularization<f64> {
        Regularization::None
    }

    /// Checks if the current criterion includes regularization.
    ///
    /// Will return `false` by default.
    fn is_regularized(&self) -> bool {
        match self.regularization() {
            Regularization::None => false,
            _ => true,
        }
    }

    /// Returns the regularization cost for the criterion.
    ///
    /// Will return `0` by default.
    ///
    /// This method will not be invoked by the neural network
    /// if there is explicitly no regularization.
    fn reg_cost(&self, reg_weights: MatrixSlice<f64>) -> f64 {
        self.regularization().reg_cost(reg_weights)
    }

    /// Returns the regularization gradient for the criterion.
    ///
    /// Will return a matrix of zeros by default.
    ///
    /// This method will not be invoked by the neural network
    /// if there is explicitly no regularization.
    fn reg_cost_grad(&self, reg_weights: MatrixSlice<f64>) -> Matrix<f64> {
        self.regularization().reg_grad(reg_weights)
    }
}

/// The binary cross entropy criterion.
///
/// Uses the Sigmoid activation function and the
/// cross entropy error.
#[derive(Clone, Copy, Debug)]
pub struct BCECriterion {
    regularization: Regularization<f64>,
}

impl Criterion for BCECriterion {
    type ActFunc = activ_fn::Sigmoid;
    type Cost = cost_fn::CrossEntropyError;

    fn regularization(&self) -> Regularization<f64> {
        self.regularization
    }
}
/// Creates a BCE criterion without any regularization.
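///
/// # Examples
///
/// ```
/// use rusty_machine::learning::nnet::BCECriterion;
///
/// // Equivalent to `BCECriterion::new(Regularization::None)`.
/// let criterion = BCECriterion::default();
/// ```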
impl Default for BCECriterion {
    fn default() -> Self {
        BCECriterion { regularization: Regularization::None }
    }
}

impl BCECriterion {
    /// Constructs a new BCECriterion with the given regularization.
    ///
    /// # Examples
    ///
    /// ```
    /// use rusty_machine::learning::nnet::BCECriterion;
    /// use rusty_machine::learning::toolkit::regularization::Regularization;
    ///
    /// // Create a new BCE criterion with L2 regularization of 0.3.
    /// let criterion = BCECriterion::new(Regularization::L2(0.3f64));
    /// ```
    pub fn new(regularization: Regularization<f64>) -> Self {
        BCECriterion { regularization: regularization }
    }
}

/// The mean squared error criterion.
///
/// Uses the Linear activation function and the
/// mean squared error.
#[derive(Clone, Copy, Debug)]
pub struct MSECriterion {
    regularization: Regularization<f64>,
}

impl Criterion for MSECriterion {
    type ActFunc = activ_fn::Linear;
    type Cost = cost_fn::MeanSqError;

    fn regularization(&self) -> Regularization<f64> {
        self.regularization
    }
}
/// Creates an MSE criterion without any regularization.
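///
/// # Examples
///
/// ```
/// use rusty_machine::learning::nnet::MSECriterion;
///
/// // Equivalent to `MSECriterion::new(Regularization::None)`.
/// let criterion = MSECriterion::default();
/// ```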
impl Default for MSECriterion {
    fn default() -> Self {
        MSECriterion { regularization: Regularization::None }
    }
}

impl MSECriterion {
    /// Constructs a new MSECriterion with the given regularization.
    ///
    /// # Examples
    ///
    /// ```
    /// use rusty_machine::learning::nnet::MSECriterion;
    /// use rusty_machine::learning::toolkit::regularization::Regularization;
    ///
    /// // Create a new MSE criterion with L2 regularization of 0.3.
    /// let criterion = MSECriterion::new(Regularization::L2(0.3f64));
    /// ```
    pub fn new(regularization: Regularization<f64>) -> Self {
        MSECriterion { regularization: regularization }
    }
}