// juice/layers/common/rnn.rs

//! Create a Recurrent Layer
2//!
3//! Recurrent Neural Network Layer
4//! A type of Neural Network that can process data in sequence, with temporal understanding of
5//! one element of data flowing into the next. This type of understanding is suitable for tasks such
6//! as translating a sentence, mimicking the patterns in a musical piece, or time series forecasting.
7//!
8//! Currently this is implemented in CUDA, but not in native or opencl.
9//!
10//! ## CUDA Specific Notes - Using Juice
//! CUDA currently supports GRU, LSTM, ReLU, and tanh for RNN operations.
12//! All of these can be uni or bi-directional.
13//!
//! All of these perform better when Tensor Cores are available; this has some pretty stringent
15//! requirements (https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops);
16//!
17//! For Standard Algorithm - CUDNN_RNN_ALGO_STANDARD in Cuda Docs or RnnAlgorithm::Standard in Juice,
18//! * hidden size, input size, and batch size must be a multiple of 8
19//! * All user-provided tensors, workspace, and reserve space are aligned to 128 bit boundaries.
20//! * Math Type CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION (MathType::TensorOPMathAllowConversion) is selected.
21// TODO: Ensure workspace & reserve-space are aligned to 128 bit boundaries.
22//!
23//! ## CUDA Specific Notes - Developing Juice
24//! The following resources are your best bet for debugging an issue within Juice.
25//! Generic Docs https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html
26//! API Docs https://docs.nvidia.com/deeplearning/sdk/cudnn-api/index.html
27//!
28//! We're aiming to support the latest features in CUDNN, and promise no support for outdated
29//! versions of CUDA or CUDNN. Current code has been tested with
//! | CUDA | CUDNN |
//! |------|-------|
//! | 10.2 | 7.6.5 |
33//!
34//! And the following graphics cards
35//! | Card            |
36//! |---              |
37//! | NVIDIA GeForce GTX 1070 |
38
39use std::rc::Rc;
40use std::sync::{Arc, RwLock};
41
42use conn::{DirectionMode, RnnAlgorithm, RnnInputMode, RnnNetworkMode};
43
44use crate::capnp_util::*;
45use crate::co::prelude::*;
46use crate::conn;
47use crate::conn::RnnConfig as connRnnConfig;
48use crate::juice_capnp::rnn_config as capnp_config;
49use crate::layer::*;
50use crate::util::{native_backend, ArcLock};
51use crate::weight::FillerType;
52
#[derive(Debug, Clone)]
/// Recurrent Neural Network layer over a backend `B` implementing `conn::Rnn<f32>`.
///
/// Hyper-parameters are copied in from [`RnnConfig`] by `from_config`;
/// `workspace` and `rnn_config` start out as `None` and are populated later
/// by `reshape` and `resize_shared_workspace`.
pub struct Rnn<B: conn::Rnn<f32>> {
    // Number of units in each hidden layer.
    hidden_size: usize,
    // Number of stacked hidden layers.
    num_layers: usize,
    // Dropout probability handed to the backend RNN descriptor.
    dropout_probability: f32,
    // Seed for the backend's dropout RNG.
    dropout_seed: u64,
    // Cell type (e.g. LSTM — see RnnNetworkMode).
    rnn_type: RnnNetworkMode,
    // How input is fed to the network (e.g. LinearInput).
    input_mode: RnnInputMode,
    // Uni- or bi-directional processing.
    direction_mode: DirectionMode,
    // Scratch space shared with the backend; sized in `resize_shared_workspace`.
    workspace: Option<ArcLock<SharedTensor<u8>>>,
    // Backend-specific RNN descriptor, created during `reshape`.
    rnn_config: Option<Rc<B::CRNN>>,
}
66
67impl<B: conn::Rnn<f32>> Rnn<B> {
68    /// Create a RNN from a RNNConfig
69    pub fn from_config(config: &RnnConfig) -> Rnn<B> {
70        Rnn {
71            hidden_size: config.hidden_size,
72            num_layers: config.num_layers,
73            dropout_probability: config.dropout_probability,
74            dropout_seed: config.dropout_seed,
75            rnn_type: config.rnn_type,
76            input_mode: config.input_mode,
77            direction_mode: config.direction_mode,
78            workspace: None,
79            rnn_config: None,
80        }
81    }
82}
83
84impl<B: IBackend + conn::Rnn<f32>> ILayer<B> for Rnn<B> {
85    impl_ilayer_common!();
86
87    fn auto_weight_blobs(&self) -> bool {
88        true
89    }
90
91    fn reshape(
92        &mut self,
93        backend: Rc<B>,
94        input_data: &mut Vec<ArcLock<SharedTensor<f32>>>,
95        input_gradient: &mut Vec<ArcLock<SharedTensor<f32>>>,
96        weights_data: &mut Vec<ArcLock<SharedTensor<f32>>>,
97        weights_gradient: &mut Vec<ArcLock<SharedTensor<f32>>>,
98        output_data: &mut Vec<ArcLock<SharedTensor<f32>>>,
99        output_gradient: &mut Vec<ArcLock<SharedTensor<f32>>>,
100    ) {
101        let input = input_data[0].read().unwrap();
102        let mut output_data = output_data[0].write().unwrap();
103        let mut output_gradient = output_gradient[0].write().unwrap();
104
105        // Input Shape is Batch, Number of Inputs, Sequence Length
106        let input_shape = input.desc();
107        let batch_size = input_shape[0];
108        let input_size = input_shape[1];
109        let sequence_length = input_shape[2];
110
111        let hidden_size = self.hidden_size;
112
113        let output_shape = &[batch_size, hidden_size, self.num_layers];
114        input_gradient[0].write().unwrap().resize(input_shape).unwrap();
115        output_data.resize(output_shape).unwrap();
116        output_gradient.resize(output_shape).unwrap();
117
118        let config = backend
119            .new_rnn_config(
120                &input,
121                Some(self.dropout_probability),
122                Some(self.dropout_seed),
123                sequence_length as i32,
124                self.rnn_type,
125                self.input_mode,
126                self.direction_mode,
127                // Standard is likely to be effective across most parameters. This should be
128                // calculated internal to Juice if modified, allowing user input is likely to be
129                // more confusing than helpful to the end user.
130                // https://docs.nvidia.com/deeplearning/sdk/cudnn-api/index.html#cudnnRNNAlgo_t
131                // Lists the differences and how we can pick between Algorithms automatically
132                RnnAlgorithm::Standard,
133                hidden_size as i32,
134                self.num_layers as i32,
135                batch_size as i32,
136            )
137            .unwrap();
138
139        let filter_dimensions: TensorDesc = backend
140            .generate_rnn_weight_description(&config, batch_size as i32, input_size as i32)
141            .unwrap();
142
143        // weights
144        weights_data[0].write().unwrap().resize(&filter_dimensions).unwrap();
145        // biases
146        weights_data[1].write().unwrap().resize(&(1, self.hidden_size)).unwrap();
147
148        let filler = FillerType::Glorot {
149            input_size: filter_dimensions.clone().size(),
150            output_size: batch_size * self.num_layers * self.hidden_size,
151        };
152
153        let bias_filler = FillerType::Constant { value: 1.0 };
154
155        filler.fill(&mut weights_data[0].write().unwrap());
156        bias_filler.fill(&mut weights_data[1].write().unwrap());
157
158        weights_gradient[0].write().unwrap().resize(&filter_dimensions).unwrap();
159        weights_gradient[1].write().unwrap().resize(&filter_dimensions).unwrap();
160
161        self.rnn_config = Some(Rc::new(config));
162    }
163
164    fn resize_shared_workspace(
165        &mut self,
166        backend: Rc<B>,
167        workspace: Option<ArcLock<SharedTensor<u8>>>,
168    ) -> Option<ArcLock<SharedTensor<u8>>> {
169        let required_size = self.rnn_config.as_ref().unwrap().workspace_size();
170
171        if let Some(old_workspace) = workspace.clone() {
172            let old_workspace_size = old_workspace.read().unwrap().capacity();
173            if old_workspace_size >= required_size {
174                return Some(old_workspace);
175            }
176        }
177        self.workspace = Some(Arc::new(RwLock::new(SharedTensor::<u8>::new(&[required_size]))));
178        self.workspace.clone()
179    }
180}
181
182impl<B: IBackend + conn::Rnn<f32>> ComputeOutput<f32, B> for Rnn<B> {
183    fn compute_output(
184        &self,
185        backend: &B,
186        weights: &[&SharedTensor<f32>],
187        input_data: &[&SharedTensor<f32>],
188        output_data: &mut [&mut SharedTensor<f32>],
189    ) {
190        let input_shape = input_data[0].desc();
191        let batch_size = input_shape[0];
192        let input_size = input_shape[1];
193        let sequence_length = input_shape[2];
194        let rnn_config = self.rnn_config.as_ref().unwrap();
195        let mut workspace = self.workspace.as_ref().unwrap().write().unwrap();
196        backend
197            .rnn_forward(&input_data[0], output_data[0], rnn_config, weights[0], &mut workspace)
198            .unwrap();
199    }
200}
201
202impl<B: IBackend + conn::Rnn<f32>> ComputeInputGradient<f32, B> for Rnn<B> {
203    fn compute_input_gradient(
204        &self,
205        backend: &B,
206        weights_data: &[&SharedTensor<f32>],
207        output_data: &[&SharedTensor<f32>],
208        output_gradients: &[&SharedTensor<f32>],
209        input_data: &[&SharedTensor<f32>],
210        input_gradients: &mut [&mut SharedTensor<f32>],
211    ) {
212        let rnn_config = self.rnn_config.as_ref().unwrap();
213        let mut workspace = self.workspace.as_ref().unwrap().write().unwrap();
214
215        let src = input_data[0];
216        let input_shape = src.desc();
217        let batch_size = input_shape[0];
218        let input_size = input_shape[1];
219        let sequence_length = input_shape[2];
220        let native_backend = native_backend();
221        let readable_input = src.read(native_backend.device()).unwrap().as_slice::<f32>().to_vec();
222
223        backend
224            .rnn_backward_data(
225                &input_data[0],
226                input_gradients[0],
227                &output_data[0],
228                output_gradients[0],
229                rnn_config,
230                weights_data[0],
231                &mut workspace,
232            )
233            .unwrap();
234    }
235}
236
impl<B: IBackend + conn::Rnn<f32>> ComputeParametersGradient<f32, B> for Rnn<B> {
    /// Backward-weights pass: compute gradients for the parameter blobs
    /// (filter weights in slot 0, bias in slot 1).
    ///
    /// NOTE(review): the two calls below are identical except for the
    /// destination blob, so the "bias" gradient receives the same
    /// backward-weights computation as the filter gradient — confirm this is
    /// intended, given the bias blob has a different shape than the filter.
    ///
    /// # Panics
    /// Panics if `reshape`/`resize_shared_workspace` have not run yet.
    fn compute_parameters_gradient(
        &self,
        backend: &B,
        output_data: &[&SharedTensor<f32>],
        output_gradients: &[&SharedTensor<f32>],
        input_data: &[&SharedTensor<f32>],
        parameters_gradients: &mut [&mut SharedTensor<f32>],
    ) {
        // Descriptor and workspace are created earlier in the layer lifecycle.
        let rnn_config = self.rnn_config.as_ref().unwrap();
        let mut workspace = self.workspace.as_ref().unwrap().write().unwrap();

        // weights
        backend
            .rnn_backward_weights(
                &input_data[0],
                &output_data[0],
                &mut parameters_gradients[0],
                rnn_config,
                &mut workspace,
            )
            .unwrap();

        // bias
        backend
            .rnn_backward_weights(
                &input_data[0],
                &output_data[0],
                &mut parameters_gradients[1],
                rnn_config,
                &mut workspace,
            )
            .unwrap();
    }
}
272
#[derive(Debug, Clone, Copy)]
/// Specifies configuration parameters for a RNN Layer.
/// TODO: Update to RnnConfig in CUDA Layer
pub struct RnnConfig {
    /// Size of the Hidden Layer — number of units per hidden layer.
    pub hidden_size: usize,
    /// Number of Hidden Layers stacked on top of each other.
    pub num_layers: usize,
    /// Type of RNN cell (e.g. LSTM — see `RnnNetworkMode`).
    pub rnn_type: RnnNetworkMode,
    /// Dropout Probability passed to the backend RNN descriptor.
    pub dropout_probability: f32,
    /// Dropout Seed for the backend's dropout RNG.
    pub dropout_seed: u64,
    /// Input Mode (e.g. `LinearInput` — see `RnnInputMode`).
    pub input_mode: RnnInputMode,
    /// RNN Direction — uni- or bi-directional.
    pub direction_mode: DirectionMode,
}
292
293impl Into<LayerType> for RnnConfig {
294    fn into(self) -> LayerType {
295        LayerType::Rnn(self)
296    }
297}
298
impl<'a> CapnpWrite<'a> for RnnConfig {
    type Builder = capnp_config::Builder<'a>;

    /// Write the RnnConfig into a capnp message.
    ///
    /// Numeric fields are widened to `u64`; the enum-like fields are
    /// serialized via their `to_string()` representations, which the matching
    /// `read_capnp` parses back with `from_string`.
    fn write_capnp(&self, builder: &mut Self::Builder) {
        builder.reborrow().set_num_layers(self.num_layers as u64);
        builder.reborrow().set_hidden_size(self.hidden_size as u64);
        builder.reborrow().set_rnn_type(&self.rnn_type.to_string());
        builder.reborrow().set_dropout_probability(self.dropout_probability);
        builder.reborrow().set_dropout_seed(self.dropout_seed);
        builder.reborrow().set_input_mode(&self.input_mode.to_string());
        builder.reborrow().set_direction_mode(&self.direction_mode.to_string());
    }
}
313
314impl<'a> CapnpRead<'a> for RnnConfig {
315    type Reader = capnp_config::Reader<'a>;
316
317    fn read_capnp(reader: Self::Reader) -> Self {
318        let read_num_layers = reader.get_num_layers() as usize;
319        let read_hidden_size = reader.get_hidden_size() as usize;
320        let read_dropout_probability = reader.get_dropout_probability();
321        let read_dropout_seed = reader.get_dropout_seed();
322        let read_rnn_type = RnnNetworkMode::from_string(reader.get_rnn_type().unwrap()).unwrap();
323        let read_input_mode = RnnInputMode::from_string(reader.get_input_mode().unwrap()).unwrap();
324        let read_direction_mode = DirectionMode::from_string(reader.get_direction_mode().unwrap()).unwrap();
325
326        RnnConfig {
327            hidden_size: read_hidden_size,
328            num_layers: read_num_layers,
329            rnn_type: read_rnn_type,
330            dropout_seed: read_dropout_seed,
331            dropout_probability: read_dropout_probability,
332            input_mode: read_input_mode,
333            direction_mode: read_direction_mode,
334        }
335    }
336}
337
#[cfg(test)]
mod tests {
    use std::rc::Rc;

    use conn::Rnn as coRnn;
    use conn::{DirectionMode, RnnAlgorithm, RnnInputMode, RnnNetworkMode};

    #[cfg(feature = "cuda")]
    use crate::co::frameworks::cuda::get_cuda_backend as cuda_backend;
    use crate::co::*;
    use crate::layer::{ComputeInputGradient, ComputeOutput, ComputeParametersGradient, ILayer};
    use crate::util::native_backend;
    use crate::weight::FillerType;

    use super::{Rnn, RnnConfig};

    // NOTE(review): currently unused by any test below.
    fn sample_input_64() -> Vec<f32> {
        vec![
            // Default Input Type - Batch of 8 Elements, 8 Time Parts, Width 1, Height 1.
            0.5f32;64
        ]
    }

    // Constant 0.5 input for the 5x5x1x1 tensor used in `rnn_create_layer`.
    fn sample_input_25() -> Vec<f32> {
        vec![
            // Default Input Type - Batch of 5 Elements, 5 Time Parts, Width 1, Height 1.
            0.5f32;25
        ]
    }

    // NOTE(review): currently unused by any test below.
    fn sample_output() -> &'static [f32] {
        [0.6639924, 0.5426032, 0.7527217, 0.3648719, 0.6244233].as_ref()
    }

    // Smoke test: construct the layer from a config and create a backend RNN
    // descriptor for it. Does not run forward/backward.
    #[test]
    #[cfg(feature = "cuda")]
    fn rnn_create_layer() {
        let cfg = RnnConfig {
            hidden_size: 8,
            num_layers: 2,
            dropout_probability: 0.5,
            dropout_seed: 0,
            rnn_type: RnnNetworkMode::LSTM,
            input_mode: RnnInputMode::LinearInput,
            direction_mode: DirectionMode::UniDirectional,
        };

        let native_backend = native_backend();
        let backend = cuda_backend();

        let batch_size = 5_usize;
        let sequence_length = 5_usize;
        let height = 1_usize;
        let width = 1_usize;

        let hidden_size = cfg.hidden_size;
        let num_layers = cfg.num_layers;

        let input_shape = &(batch_size, sequence_length, height, width);
        let mut layer = Rnn::<Backend<Cuda>>::from_config(&cfg);

        // Fill the input on the native (host) device before handing it to CUDA.
        let mut input_data = SharedTensor::<f32>::new(input_shape);
        input_data
            .write_only(native_backend.device())
            .unwrap()
            .as_mut_slice()
            .copy_from_slice(&sample_input_25());

        let input_shape = input_data.desc();

        // NOTE(review): `output_data` is created but never used by this test.
        let output_shape = &[input_shape[0], input_shape[1], num_layers];
        let output_data = SharedTensor::<f32>::new(output_shape);

        layer.rnn_config = Some(Rc::from(
            backend
                .new_rnn_config(
                    &input_data,
                    None,
                    None,
                    sequence_length as i32,
                    RnnNetworkMode::LSTM,
                    RnnInputMode::LinearInput,
                    DirectionMode::UniDirectional,
                    RnnAlgorithm::Standard,
                    hidden_size as i32,
                    num_layers as i32,
                    input_shape[0] as i32,
                )
                .unwrap(),
        ));
    }

    // Full round trip: forward pass, input-gradient pass, and
    // parameter-gradient pass on the CUDA backend.
    #[test]
    #[cfg(feature = "cuda")]
    fn rnn_roundtrip_pass() {
        let _ = env_logger::builder()
            .is_test(true)
            .filter_level(log::LevelFilter::Trace)
            .try_init();

        let backend: Backend<Cuda> = cuda_backend();
        const SEQUENCE_LENGTH: usize = 7;
        const HIDDEN_SIZE: usize = 5;
        const NUM_LAYERS: usize = 3;
        const BATCH_SIZE: usize = 2;
        const INPUT_SIZE: usize = 11;

        let cfg = RnnConfig {
            hidden_size: HIDDEN_SIZE,
            num_layers: NUM_LAYERS,
            dropout_probability: 0.5,
            dropout_seed: 1337,
            rnn_type: RnnNetworkMode::LSTM,
            input_mode: RnnInputMode::LinearInput,
            direction_mode: DirectionMode::UniDirectional,
        };

        let native_backend = native_backend();
        let mut layer = Rnn::<Backend<Cuda>>::from_config(&cfg);

        let input_shape = vec![BATCH_SIZE, INPUT_SIZE, 1, 1];

        let mut input_data = SharedTensor::<f32>::new(&input_shape);
        let mut input_gradients = SharedTensor::<f32>::new(&input_shape);

        // Constant 0.5 input written on the host device.
        let data = std::iter::repeat(0.5_f32)
            .take(BATCH_SIZE * INPUT_SIZE)
            .collect::<Vec<f32>>();
        input_data
            .write_only(native_backend.device())
            .unwrap()
            .as_mut_slice()
            .copy_from_slice(&data);

        let output_shape = vec![BATCH_SIZE, HIDDEN_SIZE, 1];

        let mut output_data = SharedTensor::<f32>::new(&output_shape);

        // Descriptor is created directly on the backend (bypassing `reshape`)
        // so the test controls the exact RNN parameters.
        let config = backend
            .new_rnn_config(
                &input_data,
                None,
                None,
                SEQUENCE_LENGTH as i32,
                RnnNetworkMode::LSTM,
                RnnInputMode::LinearInput,
                DirectionMode::UniDirectional,
                RnnAlgorithm::Standard,
                HIDDEN_SIZE as i32,
                NUM_LAYERS as i32,
                BATCH_SIZE as i32,
            )
            .unwrap();

        let filter_dimensions = <Backend<Cuda> as conn::Rnn<f32>>::generate_rnn_weight_description(
            &backend,
            &config,
            BATCH_SIZE as i32,
            INPUT_SIZE as i32,
        )
        .unwrap();

        layer.rnn_config = Some(Rc::from(config));

        let mut weights_data = vec![
            SharedTensor::<f32>::new(&filter_dimensions),
            SharedTensor::<f32>::new(&filter_dimensions), // bias
        ];

        // NOTE(review): this vec is never used, and the bias entry is sized
        // (1, SEQUENCE_LENGTH) while the layer's `reshape` sizes the bias as
        // (1, hidden_size) — confirm which shape is intended.
        let weights_gradient = vec![
            SharedTensor::<f32>::new(&filter_dimensions),
            SharedTensor::<f32>::new(&(1, SEQUENCE_LENGTH)), // bias
        ];

        let filler = FillerType::Constant { value: 0.02 };

        filler.fill(&mut weights_data[0]);
        filler.fill(&mut weights_data[1]);

        // Allocate the workspace the backend requires for this descriptor.
        layer.resize_shared_workspace(Rc::from(cuda_backend()), None);

        layer.compute_output(
            &backend,
            &weights_data.iter().collect::<Vec<_>>(),
            &[&input_data],
            &mut [&mut output_data],
        );

        // simulate some feedback
        let mut output_gradients = SharedTensor::<f32>::new(&output_shape);
        filler.fill(&mut output_gradients);

        layer.compute_input_gradient(
            &backend,
            &weights_data.iter().collect::<Vec<_>>(),
            &[&output_data],
            &[&output_gradients],
            &[&input_data],
            &mut [&mut input_gradients],
        );

        layer.compute_parameters_gradient(
            &backend,
            &[&output_data],
            &[&output_gradients],
            &[&input_data],
            &mut weights_data.iter_mut().collect::<Vec<_>>(),
        );
    }
}
547}