1use crate::layers::*;
2use crate::cpu_params::*;
3use crate::ocl::*;
4use crate::util::*;
5
6use log::{debug, warn};
7
8use rand::{thread_rng, Rng, ThreadRng};
9
10use ocl::{Buffer, Context, Device, Kernel, MemFlags, Program, Queue};
11use std::{collections::HashMap, error::Error};
12
/// OpenCL source for the fully-connected forward pass.
///
/// Concatenated at build time with one of the `OCL_ACTIVATION_*` snippets,
/// which must define `activation(x)` (see `init_ocl`).
///
/// One work-item per output neuron per batch entry
/// (global work size = `self_shape * batch_size`):
///   `out[b * self_shape + n] = activation(dot(ws[n], in[b]) + bias[n])`
/// with weights laid out row-major as `ws[n * prev_shape + j]`.
///
/// Neurons whose index falls in `[dropout_idx, dropout_idx + dropout_len)`
/// are zeroed after activation — a contiguous-block dropout scheme
/// (the same block is dropped for every entry of the batch).
static FC_LAYER_KERNEL_FWD: &'static str = r#"
    __kernel void fc_layer_product(
        __private int const batch_size,
        __private int const prev_shape,
        __private int const self_shape,
        __private int const dropout_idx,
        __private int const dropout_len,
        __global const float *in,
        __global const float *bias,
        __global const float *ws,
        __global float *out)
    {
        uint const idx = get_global_id(0);
        __private uint const real_idx = idx % self_shape;
        __private uint const batch_idx = idx / self_shape;

        __private float sum = 0.0;

        for (int j = 0; j < prev_shape; ++j) {
            sum += ws[real_idx * prev_shape + j] * in[j + prev_shape * batch_idx];
        }

        out[idx] = activation(sum + bias[real_idx]);

        if (real_idx >= dropout_idx && real_idx < dropout_idx + dropout_len) {
            out[idx] = 0.0;
        }
    }
"#;
42
/// OpenCL source for the fully-connected backward pass.
///
/// Concatenated at build time with one of the `OCL_ACTIVATION_*_DERIV`
/// snippets, which must define `deriv(x)` (see `init_ocl`).
///
/// One work-item per neuron of this layer (global work size = `self_shape`).
/// First loop: for every batch entry, the neuron gradient is the error
/// back-propagated through the next layer's weights
/// (`next_ws[j * self_shape + idx]`), scaled by `deriv(self_out)`.
/// Second loop: each weight gradient is the neuron gradient times the
/// previous layer's output, averaged over the batch.
///
/// Note: `neu_grad` is written by the first loop and read by the second
/// within the same work-item, so no barrier is needed.
static FC_LAYER_KERNEL_BWD: &'static str = r#"
    __kernel void fc_layer_grad(
        __private int const batch_size,
        __private int const prev_shape,
        __private int const next_shape,
        __private int const self_shape,
        __global const float *self_out,
        __global const float *next_grad,
        __global const float *next_ws,
        __global const float *prev_out,
        __global float *neu_grad,
        __global float *ws_grad)
    {
        uint const idx = get_global_id(0);

        for (int i = 0; i < batch_size; ++i) {
            __private float sum_err = 0.0;

            for (int j = 0; j < next_shape; ++j) {
                sum_err += next_grad[i * next_shape + j] * next_ws[self_shape * j + idx];
            }

            neu_grad[i * self_shape + idx] = sum_err * deriv(self_out[i * self_shape + idx]);
        }

        for (int i = 0; i < prev_shape; ++i) {
            __private float avg_grad = 0.0;

            for (int j = 0; j < batch_size; ++j) {
                avg_grad += neu_grad[j * self_shape + idx] * prev_out[j * prev_shape + i];
            }

            avg_grad = avg_grad / batch_size;

            ws_grad[idx * prev_shape + i] = avg_grad;
        }
    }
"#;
81
/// Fully-connected layer whose forward and backward passes run as OpenCL
/// kernels. Create with `new`, then call `init_ocl` to build the kernels
/// before any other trait method is used.
pub struct FcLayerOcl {
    // CPU-side parameter storage; the AbstractLayer impl below does not
    // expose it (cpu_params() returns None) — kept for interface parity.
    cpu_params: CpuParams,
    // Device-side buffers (weights, bias, output, gradients).
    ocl_params: OclParams,
    // Number of neurons in this layer.
    size: usize,
    // Current batch size; kernels are re-dimensioned via set_batch_size.
    batch_size: usize,

    // Fraction of neurons zeroed each forward pass (contiguous block).
    dropout: f32,
    // RNG used to pick the dropout block offset each forward pass.
    rng: ThreadRng,

    // Built lazily by init_ocl; None until then.
    ocl_kernel: Option<Kernel>,
    ocl_kernel_grad: Option<Kernel>,
    ocl_queue: Option<Queue>,
    ocl_act_func: OclActivationFunc,
}
96
97impl FcLayerOcl {
98 pub fn new(size: usize, act: OclActivationFunc) -> Self {
99 Self {
100 cpu_params: CpuParams::empty(),
101 ocl_params: OclParams::empty(),
102 size,
103 batch_size: 1,
104 ocl_kernel: None,
105 ocl_queue: None,
106 ocl_kernel_grad: None,
107 ocl_act_func: act,
108 dropout: 0.0,
109 rng: thread_rng(),
110 }
111 }
112
113 pub fn set_activation_function(&mut self, act: OclActivationFunc) {
115 if self.ocl_kernel.is_some() {
116 warn!("Setting ocl activation function, while kernel is already built");
117 }
118
119 self.ocl_act_func = act;
120 }
121
122 pub fn set_dropout(&mut self, dropout: f32) {
123 self.dropout = dropout;
124 }
125
126 pub fn dropout(&self) -> f32 {
127 self.dropout
128 }
129}
130
131impl AbstractLayer for FcLayerOcl {
132 fn layer_type(&self) -> &str {
133 "FcLayerOcl"
134 }
135
136 fn size(&self) -> usize {
137 self.size
138 }
139
140 fn cpu_params(&self) -> Option<CpuParams> {
141 None
142 }
143
144 fn set_cpu_params(&mut self, lp: CpuParams) {}
145
146 fn trainable_bufs(&self) -> TrainableBufsIds {
147 (
148 &[TypeBuffer::Weights as i32, TypeBuffer::Bias as i32],
149 &[TypeBuffer::WeightsGrad as i32, TypeBuffer::NeuGrad as i32],
150 )
151 }
152
153 fn set_input_shape(&mut self, sh: &[usize]) {
154 let kern = self.ocl_kernel.as_mut().unwrap();
155 let kern_grad = self.ocl_kernel_grad.as_mut().unwrap();
156 kern.set_arg("prev_shape", sh[0] as i32)
157 .expect("[fc_ocl] Failed to set prev_shape arg");
158 kern_grad
159 .set_arg("prev_shape", sh[0] as i32)
160 .expect("[fc_ocl] Failed to set prev_shape arg");
161
162 let queue = self.ocl_queue.as_ref().unwrap();
163 self.ocl_params =
165 init_ocl_params(queue.clone(), self.size, sh, true).expect("Buffer create failure");
166 }
167
168 fn set_batch_size(&mut self, batch_size: usize) {
169 debug!("[fc_ocl] set batch size {}", batch_size);
170
171 self.batch_size = batch_size;
172
173 self.ocl_kernel
174 .as_mut()
175 .unwrap()
176 .set_default_global_work_size(ocl::SpatialDims::One(self.size * self.batch_size));
177
178 self.ocl_kernel
179 .as_mut()
180 .unwrap()
181 .set_arg("batch_size", batch_size as i32)
182 .expect("[fc_ocl] Failed to set batch_size arg");
183 self.ocl_kernel_grad
184 .as_mut()
185 .unwrap()
186 .set_arg("batch_size", batch_size as i32)
187 .expect("[fc_ocl] Failed to set batch_size arg");
188
189 self.ocl_params
190 .fit_to_batch_size_ocl(
191 self.size,
192 batch_size,
193 self.ocl_queue.as_ref().unwrap().clone(),
194 )
195 .expect("Fit to batch size ocl failed");
196 }
197
198 fn copy_layer(&self) -> Box<dyn AbstractLayer> {
200 panic!("Do not copy OCL layers !");
201 }
202
203 fn clone_layer(&self) -> Box<dyn AbstractLayer> {
205 panic!("Do not copy OCL layers !");
206 }
207}
208
impl AbstractLayerOcl for FcLayerOcl {
    /// Builds the forward and backward OpenCL programs/kernels for this
    /// layer and stores the queue.
    ///
    /// The activation function and its derivative are injected by
    /// prepending the matching `OCL_ACTIVATION_*` source snippet to each
    /// kernel's source before compilation.
    ///
    /// # Errors
    /// Returns any program-build or kernel-build error from the `ocl` crate.
    fn init_ocl(
        &mut self,
        ocl_ctx: &Context,
        device: Device,
        queue: Queue,
    ) -> Result<(), Box<dyn Error>> {
        // Source snippet defining `activation(x)` for the forward kernel.
        let fwd_act = match self.ocl_act_func {
            OclActivationFunc::Sigmoid => OCL_ACTIVATION_SIGMOID,
            OclActivationFunc::Tanh => OCL_ACTIVATION_TANH,
            OclActivationFunc::ReLU => OCL_ACTIVATION_RELU,
            OclActivationFunc::Raw => OCL_ACTIVATION_RAW,
            OclActivationFunc::LeakyReLU => OCL_ACTIVATION_LEAKY_RELU,
        };

        // Source snippet defining `deriv(x)` for the backward kernel.
        let bwd_act = match self.ocl_act_func {
            OclActivationFunc::Sigmoid => OCL_ACTIVATION_SIGMOID_DERIV,
            OclActivationFunc::Tanh => OCL_ACTIVATION_TANH_DERIV,
            OclActivationFunc::ReLU => OCL_ACTIVATION_RELU_DERIV,
            OclActivationFunc::LeakyReLU => OCL_ACTIVATION_LEAKY_RELU_DERIV,
            OclActivationFunc::Raw => OCL_ACTIVATION_RAW_DERIV,
        };

        let program_fwd = [fwd_act, FC_LAYER_KERNEL_FWD].join("\n");
        let program_bwd = [bwd_act, FC_LAYER_KERNEL_BWD].join("\n");

        let program = Program::builder()
            .devices(device)
            .src(program_fwd)
            .build(&ocl_ctx)?;
        let program_grad = Program::builder()
            .devices(device)
            .src(program_bwd)
            .build(&ocl_ctx)?;

        // Buffer args are bound later (per pass); shape args are patched by
        // set_input_shape / set_batch_size, so placeholders go in here.
        let kern = Kernel::builder()
            .name("fc_layer_product")
            .program(&program)
            .queue(queue.clone())
            .global_work_size(self.size * self.batch_size)
            .arg_named("batch_size", self.batch_size as i32)
            .arg_named("prev_shape", 0 as i32)
            .arg_named("self_shape", self.size as i32)
            .arg_named("dropout_idx", 0 as i32)
            .arg_named("dropout_len", 0 as i32)
            .arg_named("in", None::<&Buffer<f32>>)
            .arg_named("bias", None::<&Buffer<f32>>)
            .arg_named("ws", None::<&Buffer<f32>>)
            .arg_named("out", None::<&Buffer<f32>>)
            .build()?;

        // Backward kernel runs one work-item per neuron (no batch factor).
        let kern_grad = Kernel::builder()
            .name("fc_layer_grad")
            .program(&program_grad)
            .queue(queue.clone())
            .global_work_size(self.size)
            .arg_named("batch_size", self.batch_size as i32)
            .arg_named("prev_shape", 0 as i32)
            .arg_named("next_shape", 0 as i32)
            .arg_named("self_shape", self.size as i32)
            .arg_named("self_out", None::<&Buffer<f32>>)
            .arg_named("next_grad", None::<&Buffer<f32>>)
            .arg_named("next_ws", None::<&Buffer<f32>>)
            .arg_named("prev_out", None::<&Buffer<f32>>)
            .arg_named("neu_grad", None::<&Buffer<f32>>)
            .arg_named("ws_grad", None::<&Buffer<f32>>)
            .build()?;

        self.ocl_kernel = Some(kern);
        self.ocl_kernel_grad = Some(kern_grad);
        self.ocl_queue = Some(queue);

        Ok(())
    }

    /// Runs the forward kernel: binds the previous layer's output plus this
    /// layer's weights/bias/output buffers, picks a fresh dropout block, and
    /// enqueues `fc_layer_product`.
    ///
    /// Panics if `init_ocl` has not been called, or on any arg-set/enqueue
    /// failure.
    fn forward_ocl(&mut self, params: OclParamsBlob) -> LayerOclResult {
        let prev_params = params.first().unwrap();
        let prev_output = prev_params.get_buf_t(TypeBuffer::Output);
        let prev_output = prev_output.0.borrow();

        let self_ws = self.ocl_params.get_buf_t(TypeBuffer::Weights);
        let self_ws = self_ws.0.borrow();

        let self_output = self.ocl_params.get_buf_t(TypeBuffer::Output);
        let self_output = self_output.0.borrow();

        let self_bias = self.ocl_params.get_buf_t(TypeBuffer::Bias);
        let self_bias = self_bias.0.borrow();

        let self_kern = self.ocl_kernel.as_ref().unwrap();

        // A new contiguous dropout block is chosen on every forward pass.
        // NOTE(review): legacy two-arg `gen_range(lo, hi)` (rand < 0.7);
        // it panics when the range is empty, i.e. when dropout == 1.0
        // makes `size - dropout_len == 0` — confirm callers keep
        // dropout < 1.0. Also applied during inference — verify intended.
        let dropout_len = (self.size as f32 * self.dropout) as i32;
        let dropout_idx = self.rng.gen_range(0, self.size - dropout_len as usize);

        self_kern
            .set_arg("in", &*prev_output)
            .expect("[fc_ocl] Setting param IN failure");
        self_kern
            .set_arg("bias", &*self_bias)
            .expect("[fc_ocl] Failed to set BIAS param");
        self_kern
            .set_arg("ws", &*self_ws)
            .expect("[fc_ocl] Setting param WS failure");
        self_kern
            .set_arg("out", &*self_output)
            .expect("[fc_ocl] Setting param OUT failure");
        self_kern
            .set_arg("dropout_idx", dropout_idx as i32)
            .expect("[fc_ocl] Failed to set DROPOUT_IDX");
        self_kern
            .set_arg("dropout_len", dropout_len as i32)
            .expect("[fc_ocl] Failed to set DROPOUT LEN");

        // SAFETY (ocl contract): kernel enqueue is unsafe because the
        // library cannot verify buffer/arg validity; all args were bound
        // just above.
        unsafe {
            self_kern
                .enq()
                .expect("[fc_ocl] Enqueue forward kernel failure");
        }

        debug!("[fc_ocl] forward");

        Ok(vec![self.ocl_params.clone()])
    }

    /// Runs the backward kernel: binds this layer's output and gradient
    /// buffers, the previous layer's output, and the next layer's weights
    /// and neuron gradients, then enqueues `fc_layer_grad`.
    ///
    /// `next_shape` is derived from the next layer's gradient buffer length
    /// divided by the batch size.
    ///
    /// Panics if `init_ocl` has not been called, or on any arg-set/enqueue
    /// failure.
    fn backward_ocl(
        &mut self,
        prev_input: OclParamsBlob,
        next_input: OclParamsBlob,
    ) -> LayerOclResult {
        let self_out = self.ocl_params.get_buf_t(TypeBuffer::Output);
        let self_out = self_out.0.borrow();

        let self_neu_grad = self.ocl_params.get_buf_t(TypeBuffer::NeuGrad);
        let self_neu_grad = self_neu_grad.0.borrow();

        // Mutably borrowed: the kernel writes weight gradients here.
        let self_ws_grad = self.ocl_params.get_buf_t(TypeBuffer::WeightsGrad);
        let self_ws_grad = self_ws_grad.0.borrow_mut();

        let prev_out = prev_input.first().unwrap().get_buf_t(TypeBuffer::Output);
        let prev_out = prev_out.0.borrow();

        let next_ws = next_input.first().unwrap().get_buf_t(TypeBuffer::Weights);
        let next_ws = next_ws.0.borrow();

        let next_grad = next_input.first().unwrap().get_buf_t(TypeBuffer::NeuGrad);
        let next_grad = next_grad.0.borrow();

        let next_shape = next_grad.len() / self.batch_size;

        let self_kern = self.ocl_kernel_grad.as_ref().unwrap();

        self_kern
            .set_arg("self_out", &*self_out)
            .expect("[fc_ocl] Setting param SELF_OUT failure");
        self_kern
            .set_arg("neu_grad", &*self_neu_grad)
            .expect("[fc_ocl] Setting param NEU_GRAD failure");
        self_kern
            .set_arg("ws_grad", &*self_ws_grad)
            .expect("[fc_ocl] Setting param WS_GRAD failure");
        self_kern
            .set_arg("prev_out", &*prev_out)
            .expect("[fc_ocl] Setting param PREV_OUT failure");
        self_kern
            .set_arg("next_ws", &*next_ws)
            .expect("[fc_ocl] Setting param NEXT_WS failure");
        self_kern
            .set_arg("next_grad", &*next_grad)
            .expect("[fc_ocl] Setting param NEXT_GRAD failure");

        self_kern
            .set_arg("next_shape", next_shape as i32)
            .expect("[fc_ocl] Setting param NEXT_SHAPE failure");

        // SAFETY (ocl contract): see forward_ocl — all args bound above.
        unsafe {
            self_kern
                .enq()
                .expect("[fc_ocl] Enqueue backward kernel failure");
        }

        debug!("[fc_ocl] backward done");

        Ok(vec![self.ocl_params.clone()])
    }

    /// Returns a handle to this layer's device-side parameters.
    fn ocl_params(&self) -> Option<OclParams> {
        Some(self.ocl_params.clone())
    }

    /// Replaces this layer's device-side parameters.
    fn set_ocl_params(&mut self, params: OclParams) {
        self.ocl_params = params;
    }

    /// Not yet implemented.
    fn copy_layer_ocl(&self) -> Box<dyn AbstractLayerOcl> {
        todo!()
    }

    /// Clones the layer configuration (see `Clone` impl: kernels are not
    /// duplicated).
    fn clone_layer_ocl(&self) -> Box<dyn AbstractLayerOcl> {
        Box::new(self.clone())
    }
}
411
412impl Default for FcLayerOcl {
413 fn default() -> Self {
414 Self {
415 cpu_params: CpuParams::empty(),
416 ocl_params: OclParams::empty(),
417 size: 0,
418 batch_size: 1,
419 ocl_kernel: None,
420 ocl_kernel_grad: None,
421 ocl_queue: None,
422 ocl_act_func: OclActivationFunc::Sigmoid,
423
424 dropout: 0.0,
425 rng: thread_rng(),
426 }
427 }
428}
429
430impl Clone for FcLayerOcl {
431 fn clone(&self) -> Self {
432 let queue = self.ocl_queue.as_ref().unwrap();
433
434 Self {
435 cpu_params: self.cpu_params.clone(),
436 ocl_params: self.ocl_params.clone(),
437 size: self.size,
438 batch_size: self.batch_size,
439 ocl_kernel: None,
440 ocl_kernel_grad: None,
441 ocl_queue: Some(queue.clone()),
442 ocl_act_func: self.ocl_act_func.clone(),
443
444 dropout: 0.0,
445 rng: thread_rng(),
446 }
447 }
448
449 fn clone_from(&mut self, _source: &Self) {
450 todo!()
451 }
452}
453
454impl WithParams for FcLayerOcl {
455 fn cfg(&self) -> HashMap<String, Variant> {
456 let mut out = HashMap::new();
457
458 out.insert("size".to_string(), Variant::Int(self.size as i32));
459 out.insert(
460 "activation".to_string(),
461 Variant::String(self.ocl_act_func.to_string()),
462 );
463
464 out
465 }
466
467 fn set_cfg(&mut self, args: &HashMap<String, Variant>) {
468 if let Some(size) = args.get("size") {
469 if let Variant::Int(size) = size {
470 self.size = *size as usize;
471 }
472 }
473
474 if let Some(act_f) = args.get("activation") {
475 if let Variant::String(act_f) = act_f {
476 let cvt_res = OclActivationFunc::try_from(act_f.as_str());
477
478 if let Ok(cvt) = cvt_res {
479 self.ocl_act_func = cvt;
480 }
481 }
482 }
483 }
484}