// nevermind_neu/layers/fc_layer_ocl.rs

1use crate::layers::*;
2use crate::cpu_params::*;
3use crate::ocl::*;
4use crate::util::*;
5
6use log::{debug, warn};
7
8use rand::{thread_rng, Rng, ThreadRng};
9
10use ocl::{Buffer, Context, Device, Kernel, MemFlags, Program, Queue};
11use std::{collections::HashMap, error::Error};
12
/// OpenCL source for the fully-connected forward pass.
///
/// One work item per (batch sample, neuron): computes
/// `out = activation(dot(ws_row, in) + bias)` and then zeroes outputs whose
/// neuron index falls inside the dropout window
/// `[dropout_idx, dropout_idx + dropout_len)`. The `activation()` function is
/// not defined here - the selected activation source is prepended to this
/// template before compilation (see `init_ocl`).
///
/// NOTE(review): dropped outputs are zeroed but the surviving outputs are not
/// rescaled by 1/(1-p) (inverted dropout) - confirm this matches the training
/// scheme used by the rest of the crate.
static FC_LAYER_KERNEL_FWD: &str = r#"
    __kernel void fc_layer_product(
                __private int const batch_size,
                __private int const prev_shape,
                __private int const self_shape,
                __private int const dropout_idx,
                __private int const dropout_len,
                __global const float *in,
                __global const float *bias,
                __global const float *ws,
                __global float *out)
    {
        uint const idx = get_global_id(0);
        __private uint const real_idx = idx % self_shape;
        __private uint const batch_idx = idx / self_shape;

        __private float sum = 0.0;

        for (int j = 0; j < prev_shape; ++j) {
            sum += ws[real_idx * prev_shape + j] * in[j + prev_shape * batch_idx];
        }

        out[idx] = activation(sum + bias[real_idx]);

        if (real_idx >= dropout_idx && real_idx < dropout_idx + dropout_len) {
            out[idx] = 0.0;
        }
    }
"#;
42
/// OpenCL source for the fully-connected backward pass.
///
/// One work item per neuron of this layer (`idx`). First loop: accumulates the
/// error signal from the next layer (`next_grad` weighted by the next layer's
/// weight column for this neuron) and multiplies by `deriv(self_out)` to form
/// `neu_grad` for every batch sample. Second loop: averages
/// `neu_grad * prev_out` over the batch to produce the weight gradient row
/// `ws_grad[idx * prev_shape + i]`. The `deriv()` function is prepended to this
/// template before compilation (see `init_ocl`).
static FC_LAYER_KERNEL_BWD: &str = r#"
    __kernel void fc_layer_grad(
                __private int const batch_size,
                __private int const prev_shape,
                __private int const next_shape,
                __private int const self_shape,
                __global const float *self_out,
                __global const float *next_grad,
                __global const float *next_ws,
                __global const float *prev_out,
                __global float *neu_grad,
                __global float *ws_grad)
    {
        uint const idx = get_global_id(0);

        for (int i = 0; i < batch_size; ++i) {
            __private float sum_err = 0.0;

            for (int j = 0; j < next_shape; ++j) {
                sum_err += next_grad[i * next_shape + j] * next_ws[self_shape * j + idx];
            }

            neu_grad[i * self_shape + idx] = sum_err * deriv(self_out[i * self_shape + idx]);
        }

        for (int i = 0; i < prev_shape; ++i) {
            __private float avg_grad = 0.0;

            for (int j = 0; j < batch_size; ++j) {
                avg_grad += neu_grad[j * self_shape + idx] * prev_out[j * prev_shape + i];
            }

            avg_grad = avg_grad / batch_size;

            ws_grad[idx * prev_shape + i] = avg_grad;
        }
    }
"#;
81
/// Fully-connected (dense) layer executed on an OpenCL device.
///
/// Parameters and activations live in device buffers (`ocl_params`); the
/// forward/backward kernels are compiled lazily by `init_ocl()`.
pub struct FcLayerOcl {
    // Host-side parameter container (not exposed: `cpu_params()` returns None).
    cpu_params: CpuParams,
    // Device buffers: weights, bias, output, and gradient buffers.
    ocl_params: OclParams,
    // Number of neurons in this layer.
    size: usize,
    // Current batch size; kernel work sizes are updated via set_batch_size().
    batch_size: usize,

    // Fraction of neurons zeroed each forward pass; 0.0 disables dropout.
    dropout: f32,
    // RNG used to pick the start of the contiguous dropout window.
    rng: ThreadRng,

    // Forward kernel; None until init_ocl() has been called.
    ocl_kernel: Option<Kernel>,
    // Backward (gradient) kernel; None until init_ocl() has been called.
    ocl_kernel_grad: Option<Kernel>,
    // Command queue shared with the rest of the network; None until init_ocl().
    ocl_queue: Option<Queue>,
    // Activation used for kernel codegen; must be set before init_ocl().
    ocl_act_func: OclActivationFunc,
}
96
97impl FcLayerOcl {
98    pub fn new(size: usize, act: OclActivationFunc) -> Self {
99        Self {
100            cpu_params: CpuParams::empty(),
101            ocl_params: OclParams::empty(),
102            size,
103            batch_size: 1,
104            ocl_kernel: None,
105            ocl_queue: None,
106            ocl_kernel_grad: None,
107            ocl_act_func: act,
108            dropout: 0.0,
109            rng: thread_rng(),
110        }
111    }
112
113    /// Must be set before init_ocl() was called
114    pub fn set_activation_function(&mut self, act: OclActivationFunc) {
115        if self.ocl_kernel.is_some() {
116            warn!("Setting ocl activation function, while kernel is already built");
117        }
118
119        self.ocl_act_func = act;
120    }
121
122    pub fn set_dropout(&mut self, dropout: f32) {
123        self.dropout = dropout;
124    }
125
126    pub fn dropout(&self) -> f32 {
127        self.dropout
128    }
129}
130
131impl AbstractLayer for FcLayerOcl {
132    fn layer_type(&self) -> &str {
133        "FcLayerOcl"
134    }
135
136    fn size(&self) -> usize {
137        self.size
138    }
139
140    fn cpu_params(&self) -> Option<CpuParams> {
141        None
142    }
143
144    fn set_cpu_params(&mut self, lp: CpuParams) {}
145
146    fn trainable_bufs(&self) -> TrainableBufsIds {
147        (
148            &[TypeBuffer::Weights as i32, TypeBuffer::Bias as i32],
149            &[TypeBuffer::WeightsGrad as i32, TypeBuffer::NeuGrad as i32],
150        )
151    }
152
153    fn set_input_shape(&mut self, sh: &[usize]) {
154        let kern = self.ocl_kernel.as_mut().unwrap();
155        let kern_grad = self.ocl_kernel_grad.as_mut().unwrap();
156        kern.set_arg("prev_shape", sh[0] as i32)
157            .expect("[fc_ocl] Failed to set prev_shape arg");
158        kern_grad
159            .set_arg("prev_shape", sh[0] as i32)
160            .expect("[fc_ocl] Failed to set prev_shape arg");
161
162        let queue = self.ocl_queue.as_ref().unwrap();
163        // buffer routine
164        self.ocl_params =
165            init_ocl_params(queue.clone(), self.size, sh, true).expect("Buffer create failure");
166    }
167
168    fn set_batch_size(&mut self, batch_size: usize) {
169        debug!("[fc_ocl] set batch size {}", batch_size);
170
171        self.batch_size = batch_size;
172
173        self.ocl_kernel
174            .as_mut()
175            .unwrap()
176            .set_default_global_work_size(ocl::SpatialDims::One(self.size * self.batch_size));
177
178        self.ocl_kernel
179            .as_mut()
180            .unwrap()
181            .set_arg("batch_size", batch_size as i32)
182            .expect("[fc_ocl] Failed to set batch_size arg");
183        self.ocl_kernel_grad
184            .as_mut()
185            .unwrap()
186            .set_arg("batch_size", batch_size as i32)
187            .expect("[fc_ocl] Failed to set batch_size arg");
188
189        self.ocl_params
190            .fit_to_batch_size_ocl(
191                self.size,
192                batch_size,
193                self.ocl_queue.as_ref().unwrap().clone(),
194            )
195            .expect("Fit to batch size ocl failed");
196    }
197
198    // Do copy layer memory(ws, output, ...)
199    fn copy_layer(&self) -> Box<dyn AbstractLayer> {
200        panic!("Do not copy OCL layers !");
201    }
202
203    // Do copy only Rc
204    fn clone_layer(&self) -> Box<dyn AbstractLayer> {
205        panic!("Do not copy OCL layers !");
206    }
207}
208
209impl AbstractLayerOcl for FcLayerOcl {
210    fn init_ocl(
211        &mut self,
212        ocl_ctx: &Context,
213        device: Device,
214        queue: Queue,
215    ) -> Result<(), Box<dyn Error>> {
216        let fwd_act = match self.ocl_act_func {
217            OclActivationFunc::Sigmoid => OCL_ACTIVATION_SIGMOID,
218            OclActivationFunc::Tanh => OCL_ACTIVATION_TANH,
219            OclActivationFunc::ReLU => OCL_ACTIVATION_RELU,
220            OclActivationFunc::Raw => OCL_ACTIVATION_RAW,
221            OclActivationFunc::LeakyReLU => OCL_ACTIVATION_LEAKY_RELU,
222        };
223
224        let bwd_act = match self.ocl_act_func {
225            OclActivationFunc::Sigmoid => OCL_ACTIVATION_SIGMOID_DERIV,
226            OclActivationFunc::Tanh => OCL_ACTIVATION_TANH_DERIV,
227            OclActivationFunc::ReLU => OCL_ACTIVATION_RELU_DERIV,
228            OclActivationFunc::LeakyReLU => OCL_ACTIVATION_LEAKY_RELU_DERIV,
229            OclActivationFunc::Raw => OCL_ACTIVATION_RAW_DERIV,
230        };
231
232        let program_fwd = [fwd_act, FC_LAYER_KERNEL_FWD].join("\n");
233        let program_bwd = [bwd_act, FC_LAYER_KERNEL_BWD].join("\n");
234
235        let program = Program::builder()
236            .devices(device)
237            .src(program_fwd)
238            .build(&ocl_ctx)?;
239        let program_grad = Program::builder()
240            .devices(device)
241            .src(program_bwd)
242            .build(&ocl_ctx)?;
243
244        let kern = Kernel::builder()
245            .name("fc_layer_product")
246            .program(&program)
247            .queue(queue.clone())
248            .global_work_size(self.size * self.batch_size)
249            .arg_named("batch_size", self.batch_size as i32)
250            .arg_named("prev_shape", 0 as i32)
251            .arg_named("self_shape", self.size as i32)
252            .arg_named("dropout_idx", 0 as i32)
253            .arg_named("dropout_len", 0 as i32)
254            .arg_named("in", None::<&Buffer<f32>>)
255            .arg_named("bias", None::<&Buffer<f32>>)
256            .arg_named("ws", None::<&Buffer<f32>>)
257            .arg_named("out", None::<&Buffer<f32>>)
258            .build()?;
259
260        let kern_grad = Kernel::builder()
261            .name("fc_layer_grad")
262            .program(&program_grad)
263            .queue(queue.clone())
264            .global_work_size(self.size)
265            .arg_named("batch_size", self.batch_size as i32)
266            .arg_named("prev_shape", 0 as i32)
267            .arg_named("next_shape", 0 as i32)
268            .arg_named("self_shape", self.size as i32)
269            .arg_named("self_out", None::<&Buffer<f32>>)
270            .arg_named("next_grad", None::<&Buffer<f32>>)
271            .arg_named("next_ws", None::<&Buffer<f32>>)
272            .arg_named("prev_out", None::<&Buffer<f32>>)
273            .arg_named("neu_grad", None::<&Buffer<f32>>)
274            .arg_named("ws_grad", None::<&Buffer<f32>>)
275            .build()?;
276
277        self.ocl_kernel = Some(kern);
278        self.ocl_kernel_grad = Some(kern_grad);
279        self.ocl_queue = Some(queue);
280
281        Ok(())
282    }
283
284    fn forward_ocl(&mut self, params: OclParamsBlob) -> LayerOclResult {
285        let prev_params = params.first().unwrap();
286        let prev_output = prev_params.get_buf_t(TypeBuffer::Output);
287        let prev_output = prev_output.0.borrow();
288
289        let self_ws = self.ocl_params.get_buf_t(TypeBuffer::Weights);
290        let self_ws = self_ws.0.borrow();
291
292        let self_output = self.ocl_params.get_buf_t(TypeBuffer::Output);
293        let self_output = self_output.0.borrow();
294
295        let self_bias = self.ocl_params.get_buf_t(TypeBuffer::Bias);
296        let self_bias = self_bias.0.borrow();
297
298        let self_kern = self.ocl_kernel.as_ref().unwrap();
299
300        // dropout
301        let dropout_len = (self.size as f32 * self.dropout) as i32;
302        let dropout_idx = self.rng.gen_range(0, self.size - dropout_len as usize);
303
304        self_kern
305            .set_arg("in", &*prev_output)
306            .expect("[fc_ocl] Setting param IN failure");
307        self_kern
308            .set_arg("bias", &*self_bias)
309            .expect("[fc_ocl] Failed to set BIAS param");
310        self_kern
311            .set_arg("ws", &*self_ws)
312            .expect("[fc_ocl] Setting param WS failure");
313        self_kern
314            .set_arg("out", &*self_output)
315            .expect("[fc_ocl] Setting param OUT failure");
316        self_kern
317            .set_arg("dropout_idx", dropout_idx as i32)
318            .expect("[fc_ocl] Failed to set DROPOUT_IDX");
319        self_kern
320            .set_arg("dropout_len", dropout_len as i32)
321            .expect("[fc_ocl] Failed to set DROPOUT LEN");
322
323        unsafe {
324            self_kern
325                .enq()
326                .expect("[fc_ocl] Enqueue forward kernel failure");
327        }
328
329        debug!("[fc_ocl] forward");
330
331        Ok(vec![self.ocl_params.clone()])
332    }
333
334    fn backward_ocl(
335        &mut self,
336        prev_input: OclParamsBlob,
337        next_input: OclParamsBlob,
338    ) -> LayerOclResult {
339        let self_out = self.ocl_params.get_buf_t(TypeBuffer::Output);
340        let self_out = self_out.0.borrow();
341
342        let self_neu_grad = self.ocl_params.get_buf_t(TypeBuffer::NeuGrad);
343        let self_neu_grad = self_neu_grad.0.borrow();
344
345        let self_ws_grad = self.ocl_params.get_buf_t(TypeBuffer::WeightsGrad);
346        let self_ws_grad = self_ws_grad.0.borrow_mut();
347
348        let prev_out = prev_input.first().unwrap().get_buf_t(TypeBuffer::Output);
349        let prev_out = prev_out.0.borrow();
350
351        let next_ws = next_input.first().unwrap().get_buf_t(TypeBuffer::Weights);
352        let next_ws = next_ws.0.borrow();
353
354        let next_grad = next_input.first().unwrap().get_buf_t(TypeBuffer::NeuGrad);
355        let next_grad = next_grad.0.borrow();
356
357        let next_shape = next_grad.len() / self.batch_size;
358
359        let self_kern = self.ocl_kernel_grad.as_ref().unwrap();
360
361        self_kern
362            .set_arg("self_out", &*self_out)
363            .expect("[fc_ocl] Setting param SELF_OUT failure");
364        self_kern
365            .set_arg("neu_grad", &*self_neu_grad)
366            .expect("[fc_ocl] Setting param NEU_GRAD failure");
367        self_kern
368            .set_arg("ws_grad", &*self_ws_grad)
369            .expect("[fc_ocl] Setting param WS_GRAD failure");
370        self_kern
371            .set_arg("prev_out", &*prev_out)
372            .expect("[fc_ocl] Setting param PREV_OUT failure");
373        self_kern
374            .set_arg("next_ws", &*next_ws)
375            .expect("[fc_ocl] Setting param NEXT_WS failure");
376        self_kern
377            .set_arg("next_grad", &*next_grad)
378            .expect("[fc_ocl] Setting param NEXT_GRAD failure");
379
380        self_kern
381            .set_arg("next_shape", next_shape as i32)
382            .expect("[fc_ocl] Setting param NEXT_SHAPE failure");
383
384        unsafe {
385            self_kern
386                .enq()
387                .expect("[fc_ocl] Enqueue backward kernel failure");
388        }
389
390        debug!("[fc_ocl] backward done");
391
392        Ok(vec![self.ocl_params.clone()])
393    }
394
395    fn ocl_params(&self) -> Option<OclParams> {
396        Some(self.ocl_params.clone())
397    }
398
399    fn set_ocl_params(&mut self, params: OclParams) {
400        self.ocl_params = params;
401    }
402
403    fn copy_layer_ocl(&self) -> Box<dyn AbstractLayerOcl> {
404        todo!()
405    }
406
407    fn clone_layer_ocl(&self) -> Box<dyn AbstractLayerOcl> {
408        Box::new(self.clone())
409    }
410}
411
412impl Default for FcLayerOcl {
413    fn default() -> Self {
414        Self {
415            cpu_params: CpuParams::empty(),
416            ocl_params: OclParams::empty(),
417            size: 0,
418            batch_size: 1,
419            ocl_kernel: None,
420            ocl_kernel_grad: None,
421            ocl_queue: None,
422            ocl_act_func: OclActivationFunc::Sigmoid,
423
424            dropout: 0.0,
425            rng: thread_rng(),
426        }
427    }
428}
429
430impl Clone for FcLayerOcl {
431    fn clone(&self) -> Self {
432        let queue = self.ocl_queue.as_ref().unwrap();
433
434        Self {
435            cpu_params: self.cpu_params.clone(),
436            ocl_params: self.ocl_params.clone(),
437            size: self.size,
438            batch_size: self.batch_size,
439            ocl_kernel: None,
440            ocl_kernel_grad: None,
441            ocl_queue: Some(queue.clone()),
442            ocl_act_func: self.ocl_act_func.clone(),
443
444            dropout: 0.0,
445            rng: thread_rng(),
446        }
447    }
448
449    fn clone_from(&mut self, _source: &Self) {
450        todo!()
451    }
452}
453
454impl WithParams for FcLayerOcl {
455    fn cfg(&self) -> HashMap<String, Variant> {
456        let mut out = HashMap::new();
457
458        out.insert("size".to_string(), Variant::Int(self.size as i32));
459        out.insert(
460            "activation".to_string(),
461            Variant::String(self.ocl_act_func.to_string()),
462        );
463
464        out
465    }
466
467    fn set_cfg(&mut self, args: &HashMap<String, Variant>) {
468        if let Some(size) = args.get("size") {
469            if let Variant::Int(size) = size {
470                self.size = *size as usize;
471            }
472        }
473
474        if let Some(act_f) = args.get("activation") {
475            if let Variant::String(act_f) = act_f {
476                let cvt_res = OclActivationFunc::try_from(act_f.as_str());
477
478                if let Ok(cvt) = cvt_res {
479                    self.ocl_act_func = cvt;
480                }
481            }
482        }
483    }
484}