use opencl3::{
device::cl_float,
error_codes::cl_int,
kernel::ExecuteKernel,
memory::{Buffer, ClMem, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE},
};
use rayon::iter::{IntoParallelIterator, IntoParallelRefIterator, ParallelIterator};
use savefile_derive::Savefile;
use std::{mem, ptr, collections::HashMap};
use crate::utils::opencl::{BufferLike, InplaceBufferOperations};
#[allow(unused_imports)]
use crate::{
optimizers::Optimizer,
types::{ModelLayer, SyncDataError},
utils::{
opencl::{empty_buffer, ensure_program, EnsureKernelsAndProgramError},
BufferOperations, OpenCLState,
},
};
use super::{
compute_update_vectors, Gradient, Layer, LayerGradientApplicationError,
LayerGradientComputationError, LayerInitializationError, LayerLossToInputDifferentiationError,
LayerPropagationError, ParametersOptimizationError,
initializers::{Initializer, InitializerTrait, GlorotUniformInitializer, ConstantInitializer},
};
// OpenCL program and kernel identifiers for the Dense layer.
const DENSE_PROP_PROGRAM_NAME: &str = "DENSE_PROPAGATION";
const DENSE_BACKPROP_PROGRAM_NAME: &str = "DENSE_BACKPROPAGATION";

// Fix: constant was misspelled `PROPAGATION_PROGRAM_SORUCE`.
const PROPAGATION_PROGRAM_SOURCE: &str = include_str!("kernels/dense_propagation.cl");
const BACK_PROPAGATION_PROGRAM_SOURCE: &str = include_str!("kernels/dense_back_propagation.cl");

const PROPAGATION_KERNEL_NAME: &str = "dense_propagate";
const WEIGHTS_GRADIENT_COMPUTATION_KERNEL_NAME: &str = "weights_gradient_calculation";
const BIAS_GRADIENT_COMPUTATION_KERNEL_NAME: &str = "bias_gradient_calculation";
const LOSS_TO_INPUT_DIFFERENTIATION_KERNEL_NAME: &str =
    "compute_loss_derivative_with_respect_to_inputs";

/// Compiles and registers the Dense layer's propagation and back-propagation
/// OpenCL programs (and their kernels) into `opencl_state`.
///
/// # Errors
///
/// Returns an [`EnsureKernelsAndProgramError`] if either program fails to
/// compile or any of the expected kernels cannot be found in it.
pub(crate) fn compile_dense(
    opencl_state: &mut OpenCLState,
) -> Result<(), EnsureKernelsAndProgramError> {
    let prop_kernels = &[PROPAGATION_KERNEL_NAME.to_string()];
    let backprop_kernels = &[
        WEIGHTS_GRADIENT_COMPUTATION_KERNEL_NAME.to_string(),
        BIAS_GRADIENT_COMPUTATION_KERNEL_NAME.to_string(),
        LOSS_TO_INPUT_DIFFERENTIATION_KERNEL_NAME.to_string(),
    ];

    ensure_program(
        opencl_state,
        DENSE_PROP_PROGRAM_NAME.to_string(),
        PROPAGATION_PROGRAM_SOURCE.to_string(),
        "".to_string(), // no extra compiler options
        prop_kernels,
    )?;
    ensure_program(
        opencl_state,
        DENSE_BACKPROP_PROGRAM_NAME.to_string(),
        BACK_PROPAGATION_PROGRAM_SOURCE.to_string(),
        "".to_string(), // no extra compiler options
        backprop_kernels,
    )?;

    Ok(())
}
/// A fully-connected ("dense") layer: every output is a weighted sum of all
/// inputs plus a per-output bias.
///
/// Host-side copies of the parameters live in `weights`/`biases`; the
/// `*_buffer` fields are their GPU-side counterparts, recreated by `init`,
/// and are therefore excluded from `savefile` (de)serialization.
#[derive(Debug, Savefile)]
pub struct Dense<'a> {
    /// Number of input values this layer expects per sample.
    pub inputs_amount: usize,
    /// Number of output values this layer produces per sample.
    pub outputs_amount: usize,
    /// Host-side weights, indexed as `weights[input][output]`.
    pub weights: Vec<Vec<f32>>,
    /// Host-side biases, one per output.
    pub biases: Vec<f32>,
    /// Initializers keyed by parameter name ("weights" / "biases").
    pub initializers: HashMap<String, Initializer>,
    /// GPU buffer holding the flattened weights
    /// (flat index = input * outputs_amount + output); filled by `init`.
    #[savefile_ignore]
    #[savefile_introspect_ignore]
    pub weights_buffer: Option<Buffer<cl_float>>,
    /// GPU buffer holding the biases; filled by `init`.
    #[savefile_ignore]
    #[savefile_introspect_ignore]
    pub biases_buffer: Option<Buffer<cl_float>>,
    /// Copy of the inputs from the last `propagate` call, kept for
    /// gradient computation during back-propagation.
    #[savefile_ignore]
    #[savefile_introspect_ignore]
    pub last_inputs_buffer: Option<Buffer<cl_float>>,
    /// Outputs produced by the last `propagate` call.
    #[savefile_ignore]
    #[savefile_introspect_ignore]
    pub last_outputs_buffer: Option<Buffer<cl_float>>,
    /// OpenCL state this layer was initialized with; `None` until `init`.
    #[savefile_ignore]
    #[savefile_introspect_ignore]
    opencl_state: Option<&'a OpenCLState>,
}
impl<'a> Dense<'a> {
    /// Creates a `Dense` layer with the default initializers: Glorot
    /// (Xavier) uniform for the weights and zeros for the biases.
    ///
    /// The actual parameter values and GPU buffers are only created once
    /// `init` is called with an OpenCL state.
    pub fn new_raw(inputs_amount: usize, outputs_amount: usize) -> Dense<'a> {
        let mut initializers = HashMap::with_capacity(2);
        initializers.insert("weights".to_string(), GlorotUniformInitializer::new().into());
        initializers.insert("biases".to_string(), ConstantInitializer::new(0.0).into());

        // Note: the previous version ended with a redundant `.into()` that
        // converted `Dense` into itself via the reflexive `From` impl.
        Dense {
            inputs_amount,
            outputs_amount,
            initializers,
            weights: Vec::default(),
            biases: Vec::default(),
            weights_buffer: None,
            biases_buffer: None,
            last_inputs_buffer: None,
            last_outputs_buffer: None,
            opencl_state: None,
        }
    }

    /// Creates a `Dense` layer already wrapped in the [`ModelLayer`] enum,
    /// ready to be placed into a model's layer list.
    pub fn new(inputs_amount: usize, outputs_amount: usize) -> ModelLayer<'a> {
        Self::new_raw(inputs_amount, outputs_amount).into()
    }
}
impl<'a> Layer<'a> for Dense<'a> {
fn get_initializer_for_parameter<'b>(&'b self, parameter: &str) -> Option<&'b Initializer> {
self.initializers.get(parameter)
}
fn get_flattened_parameter_data(&self, parameter: &str) -> Option<Vec<f32>> {
match parameter {
"weights" => {
Some(self.weights.par_iter().flatten().map(|x| *x).collect())
},
"biases" => {
Some(self.biases.to_vec())
},
_ => {
None
}
}
}
fn set_initializer_for_parameter(
mut self,
initializer: Initializer,
parameter: &str,
) -> ModelLayer<'a> {
self.initializers.insert(parameter.to_string(), initializer);
self.into()
}
fn get_last_inputs(&self) -> Option<&Buffer<cl_float>> {
self.last_inputs_buffer.as_ref()
}
fn get_last_outputs(&self) -> Option<&Buffer<cl_float>> {
self.last_outputs_buffer.as_ref()
}
fn get_inputs_amount(&self) -> usize {
self.inputs_amount
}
fn get_outputs_amount(&self) -> usize {
self.outputs_amount
}
fn clean_up_gpu_state(&mut self) -> () {
if self.weights_buffer.is_some() {
drop(self.weights_buffer.as_ref().unwrap());
}
if self.biases_buffer.is_some() {
drop(self.biases_buffer.as_ref().unwrap());
}
if self.last_inputs_buffer.is_some() {
drop(self.last_inputs_buffer.as_ref().unwrap());
}
if self.last_outputs_buffer.is_some() {
drop(self.last_outputs_buffer.as_ref().unwrap());
}
}
fn sync_data_from_buffers_to_host(&mut self) -> Result<(), SyncDataError> {
if self.opencl_state.is_none() {
return Err(SyncDataError::NotInitialized);
}
let state = self.opencl_state.unwrap();
if state.queues.is_empty() {
return Err(SyncDataError::NoCommandQueue);
}
if self.weights_buffer.is_none() {
return Err(SyncDataError::NotAllocatedInDevice {
field_name: "weights_buffer".to_string(),
});
}
let weights_buffer = self.weights_buffer.as_ref().unwrap();
if self.biases_buffer.is_none() {
return Err(SyncDataError::NotAllocatedInDevice {
field_name: "biases_buffer".to_string(),
});
}
let biases_buffer = self.biases_buffer.as_ref().unwrap();
let weights_flat = Vec::<f32>::from_buffer(weights_buffer, false, state)?;
let biases = Vec::<f32>::from_buffer(biases_buffer, false, state)?;
self.biases = biases;
self.weights = (0..self.inputs_amount)
.into_par_iter()
.map(|i| {
let row_part = i * self.outputs_amount;
(0..self.outputs_amount)
.into_iter()
.map(|j| {
let flat_index = row_part + j;
weights_flat[flat_index]
})
.collect::<Vec<f32>>()
})
.collect::<Vec<Vec<f32>>>();
Ok(())
}
fn init(&mut self, opencl_state: &'a OpenCLState) -> Result<(), LayerInitializationError> {
if self.weights.is_empty() {
if let Some(initializer) = self.initializers.get("weights") {
self.weights = initializer.initialize_2d((self.inputs_amount, self.outputs_amount), self);
} else {
return Err(LayerInitializationError::MissingParameterInitializer("weights"));
}
}
if self.biases.is_empty() {
if let Some(initializer) = self.initializers.get("biases") {
self.biases = initializer.initialize_1d(self.outputs_amount, self);
} else {
return Err(LayerInitializationError::MissingParameterInitializer("biases"));
}
}
let weights_buffer = self
.weights
.par_iter()
.flatten()
.map(|x| *x)
.collect::<Vec<f32>>()
.to_buffer(false, opencl_state)?;
let biases_buffer = self.biases.to_buffer(false, opencl_state)?;
self.weights_buffer = Some(weights_buffer);
self.biases_buffer = Some(biases_buffer);
self.opencl_state = Some(opencl_state);
Ok(())
}
fn propagate(
&mut self,
input_samples: &Buffer<cl_float>,
) -> Result<&Buffer<cl_float>, LayerPropagationError> {
if self.opencl_state.is_none() {
return Err(LayerPropagationError::LayerNotInitialized);
}
let state = self.opencl_state.unwrap();
if state.queues.first().is_none() {
return Err(LayerPropagationError::NoCommandQueueFound);
}
let queue = state.queues.first().unwrap();
let context = &state.context;
let inputs_size = input_samples.size()?;
let inputs_total_count = inputs_size / mem::size_of::<cl_float>();
if inputs_total_count % self.inputs_amount != 0 {
return Err(LayerPropagationError::InputsDontMatchExpectedShape);
}
let mut copied_last_inputs_buffer = Buffer::<cl_float>::create(
context,
CL_MEM_READ_ONLY,
inputs_total_count,
ptr::null_mut(),
)?;
queue.enqueue_copy_buffer(
input_samples,
&mut copied_last_inputs_buffer,
0,
0,
inputs_size,
&[],
)?;
self.last_inputs_buffer = Some(copied_last_inputs_buffer);
let samples_amount = inputs_total_count / self.inputs_amount;
let outputs_buffer = empty_buffer(
self.outputs_amount * samples_amount,
CL_MEM_READ_WRITE,
state,
)?;
let program = state.get_prgm(DENSE_PROP_PROGRAM_NAME)?;
let kernel = program.get_krnl(PROPAGATION_KERNEL_NAME)?;
ExecuteKernel::new(kernel)
.set_arg(input_samples)
.set_arg(self.biases_buffer.as_ref().unwrap())
.set_arg(self.weights_buffer.as_ref().unwrap())
.set_arg(&outputs_buffer)
.set_arg(&(self.inputs_amount as cl_int))
.set_arg(&(samples_amount as cl_int))
.set_arg(&(self.outputs_amount as cl_int))
.set_global_work_sizes(&[samples_amount, self.outputs_amount])
.enqueue_nd_range(queue)?;
queue.finish()?;
self.last_outputs_buffer = Some(outputs_buffer);
Ok(self.last_outputs_buffer.as_ref().unwrap())
}
fn compute_gradients(
&self,
layer_output_to_error_derivative: &Buffer<cl_float>,
) -> Result<Vec<Gradient>, LayerGradientComputationError> {
if self.opencl_state.is_none() {
return Err(LayerGradientComputationError::LayerNotInitialized);
}
let state = self.opencl_state.unwrap();
if state.queues.first().is_none() {
return Err(LayerGradientComputationError::NoCommandQueueFound);
}
let queue = state.queues.first().unwrap();
if layer_output_to_error_derivative.size()? / mem::size_of::<cl_float>()
% self.outputs_amount
!= 0
{
return Err(LayerGradientComputationError::DerivativesDontMatchExpectedShape);
}
if self.last_inputs_buffer.is_none() {
return Err(LayerGradientComputationError::HasNotPropagatedBeforeCalculation);
}
let backprop_program = state.get_prgm(DENSE_BACKPROP_PROGRAM_NAME)?;
let weights_gradient_computation_kernel =
backprop_program.get_krnl(WEIGHTS_GRADIENT_COMPUTATION_KERNEL_NAME)?;
let bias_gradient_computation_kernel =
backprop_program.get_krnl(BIAS_GRADIENT_COMPUTATION_KERNEL_NAME)?;
let weights_gradients = empty_buffer(
self.inputs_amount * self.outputs_amount,
CL_MEM_READ_WRITE,
state,
)?;
let bias_gradients = empty_buffer(self.outputs_amount, CL_MEM_READ_WRITE, state)?;
let samples_amount = layer_output_to_error_derivative.size()?
/ self.outputs_amount
/ mem::size_of::<cl_float>();
let weights_event = ExecuteKernel::new(weights_gradient_computation_kernel)
.set_arg(layer_output_to_error_derivative)
.set_arg(self.last_inputs_buffer.as_ref().unwrap())
.set_arg(&weights_gradients)
.set_arg(&(samples_amount as cl_int))
.set_arg(&(self.outputs_amount as cl_int))
.set_arg(&(self.inputs_amount as cl_int))
.set_global_work_sizes(&[self.inputs_amount, self.outputs_amount])
.enqueue_nd_range(queue)?;
ExecuteKernel::new(bias_gradient_computation_kernel)
.set_arg(layer_output_to_error_derivative)
.set_arg(&bias_gradients)
.set_arg(&(samples_amount as cl_int))
.set_arg(&(self.outputs_amount as cl_int))
.set_global_work_size(self.outputs_amount)
.set_wait_event(&weights_event)
.enqueue_nd_range(queue)?;
queue.finish()?;
Ok(vec![
Gradient {
parameter_id: "weights".to_string(),
value: weights_gradients,
optimizable: true,
},
Gradient {
parameter_id: "biases".to_string(),
value: bias_gradients,
optimizable: true,
},
])
}
fn optimize_parameters(
&mut self,
optimizer: &dyn Optimizer<'a>,
layer_index: usize,
timestep: usize,
) -> Result<(), ParametersOptimizationError> {
if self.weights_buffer.is_none() {
return Err(ParametersOptimizationError::EmptyParameter(
"weights".to_string(),
));
}
if self.biases_buffer.is_none() {
return Err(ParametersOptimizationError::EmptyParameter(
"biases".to_string(),
));
}
optimizer.optimize_parameters(
self.weights_buffer.as_mut().unwrap(),
"weights".to_string(),
timestep,
layer_index,
)?;
optimizer.optimize_parameters(
self.biases_buffer.as_mut().unwrap(),
"biases".to_string(),
timestep,
layer_index,
)?;
Ok(())
}
fn apply_gradients(
&mut self,
per_parameter_type_gradients: &[Gradient],
optimizer: &mut dyn Optimizer<'a>,
layer_index: usize,
timestep: usize,
) -> Result<(), LayerGradientApplicationError> {
if self.opencl_state.is_none() {
return Err(LayerGradientApplicationError::LayerNotInitialized);
}
let state = self.opencl_state.unwrap();
if per_parameter_type_gradients.len() != 2 {
return Err(LayerGradientApplicationError::GradientsDontMatchExpectedShape);
}
let update_vectors = compute_update_vectors(
optimizer,
per_parameter_type_gradients,
layer_index,
timestep,
state
)?;
let weights_buffer = self.weights_buffer.as_mut().unwrap();
let biases_buffer = self.biases_buffer.as_mut().unwrap();
weights_buffer.subtract_inplc(&update_vectors[0], state)?;
biases_buffer.subtract_inplc(&update_vectors[1], state)?;
Ok(())
}
fn compute_loss_to_input_derivatives(
&self,
layer_output_to_error_derivative: &Buffer<cl_float>,
) -> Result<Buffer<cl_float>, LayerLossToInputDifferentiationError> {
if self.opencl_state.is_none() {
return Err(LayerLossToInputDifferentiationError::LayerNotInitialized);
}
let state = self.opencl_state.unwrap();
if state.queues.len() == 0 {
return Err(LayerLossToInputDifferentiationError::NoCommandQueueFound);
}
let queue = state.queues.first().unwrap();
let program = state.get_prgm(DENSE_BACKPROP_PROGRAM_NAME)?;
let kernel = program.get_krnl(LOSS_TO_INPUT_DIFFERENTIATION_KERNEL_NAME)?;
if layer_output_to_error_derivative.size()? % self.outputs_amount != 0 {
return Err(LayerLossToInputDifferentiationError::DerivativesDontMatchExpectedShape);
}
let samples_amount = layer_output_to_error_derivative.size()?
/ self.outputs_amount
/ mem::size_of::<cl_float>();
let loss_to_input_derivatives = empty_buffer(
samples_amount * self.inputs_amount,
CL_MEM_READ_WRITE,
state,
)?;
ExecuteKernel::new(kernel)
.set_arg(self.weights_buffer.as_ref().unwrap())
.set_arg(layer_output_to_error_derivative)
.set_arg(&loss_to_input_derivatives)
.set_arg(&(samples_amount as cl_int))
.set_arg(&(self.outputs_amount as cl_int))
.set_arg(&(self.inputs_amount as cl_int))
.set_global_work_sizes(&[samples_amount, self.inputs_amount])
.enqueue_nd_range(queue)?;
queue.finish()?;
Ok(loss_to_input_derivatives)
}
}
#[cfg(test)]
mod dense_tests {
    use std::ptr;
    use opencl3::{
        command_queue::{CL_BLOCKING, CL_NON_BLOCKING},
        device::cl_float,
        memory::{Buffer, CL_MEM_READ_ONLY},
    };
    use rand::{thread_rng, Rng};
    use crate::{
        layers::{dense::Dense, Layer},
        utils::{
            opencl::{BufferLike, DeviceType},
            setup_opencl,
        },
    };

    /// Checks the gradient kernels against a single-sample CPU reference:
    /// dL/dW[i][j] = dL/dOutput[j] * input[i] and dL/dB[j] = dL/dOutput[j].
    ///
    /// NOTE(review): requires an OpenCL-capable GPU; comparisons are done
    /// with a relative tolerance of 1e-4.
    #[test]
    fn should_apply_gradients_correctly() -> () {
        let state = setup_opencl(DeviceType::GPU).unwrap();

        // One sample: inputs_amount inputs, outputs_amount outputs.
        let inputs_amount = 10;
        let outputs_amount = 10;
        let mut gpu_dense = Dense::new_raw(inputs_amount, outputs_amount);
        gpu_dense.init(&state).unwrap();

        let mut rng = thread_rng();
        let loss_to_output_derivatives: Vec<f32> = (0..outputs_amount)
            .map(|_| rng.gen_range(-134_f32..314_f32))
            .collect();
        let inputs: Vec<f32> = (0..inputs_amount)
            .map(|_| rng.gen_range(-134_f32..314_f32))
            .collect();

        // CPU reference gradients, indexed [input][output].
        let expected_gradients: Vec<Vec<f32>> = (0..inputs_amount)
            .map(|input_index| {
                (0..outputs_amount)
                    .map(|output_index| {
                        let loss_to_output_derivative = loss_to_output_derivatives[output_index];
                        let input = inputs[input_index];
                        loss_to_output_derivative * input
                    })
                    .collect()
            })
            .collect();
        let expected_bias_gradients: Vec<f32> = loss_to_output_derivatives.to_vec();

        // Inject the inputs directly as "last inputs" instead of running a
        // forward pass, so only the gradient kernels are under test.
        let input_samples_buffer = inputs.to_buffer(true, &state).unwrap();
        gpu_dense.last_inputs_buffer = Some(input_samples_buffer);

        let loss_to_output_derivatives_buffer =
            loss_to_output_derivatives.to_buffer(true, &state).unwrap();
        let actual_gradients = gpu_dense
            .compute_gradients(&loss_to_output_derivatives_buffer)
            .unwrap();

        // Gradient 0 is the weights; un-flatten to [input][output].
        let flat_actual_weights_gradients =
            Vec::<f32>::from_buffer(&actual_gradients[0].value, true, &state).unwrap();
        let actual_weights_gradients: Vec<Vec<f32>> = (0..inputs_amount)
            .map(|input_index| {
                (0..outputs_amount)
                    .map(|output_index| {
                        let i = input_index * outputs_amount + output_index;
                        flat_actual_weights_gradients[i]
                    })
                    .collect()
            })
            .collect();
        // Gradient 1 is the biases.
        let actual_bias_gradients =
            Vec::<f32>::from_buffer(&actual_gradients[1].value, true, &state).unwrap();

        // Compare weight gradients element-wise with a relative tolerance.
        {
            expected_gradients
                .iter()
                .zip(actual_weights_gradients)
                .for_each(
                    |(input_to_output_gradients, actual_input_to_output_gradients)| {
                        input_to_output_gradients
                            .iter()
                            .zip(actual_input_to_output_gradients)
                            .for_each(|(expected_gradient, gradient)| {
                                assert!(
                                    (expected_gradient - gradient).abs()
                                        / expected_gradient.max(gradient)
                                        <= 0.0001
                                );
                            });
                    },
                );
        };
        // Compare bias gradients the same way.
        {
            expected_bias_gradients
                .iter()
                .zip(actual_bias_gradients)
                .for_each(|(expected_bias, bias)| {
                    assert!((expected_bias - bias).abs() / expected_bias.max(bias) <= 0.0001);
                })
        };
    }

    /// Runs a full forward pass on random inputs and compares the GPU
    /// outputs with a CPU computation of `W^T x + b` per sample.
    ///
    /// NOTE(review): requires an OpenCL-capable GPU; relative tolerance 1e-2.
    #[test]
    fn should_propagate_to_correct_value() {
        let state = setup_opencl(DeviceType::GPU).unwrap();
        let queue = state.queues.first().unwrap();
        let context = &state.context;

        let samples_amount = 4;
        let inputs_amount = 5;
        let outputs_amount = 5;
        let mut gpu_dense: Dense = Dense::new_raw(inputs_amount, outputs_amount);
        gpu_dense.init(&state).unwrap();

        let mut rng = thread_rng();
        let input_samples: Vec<Vec<f32>> = (0..samples_amount)
            .into_iter()
            .map(|_| {
                (0..inputs_amount)
                    .into_iter()
                    .map(|_| rng.gen_range(-1231.0_f32..=15151.0_f32))
                    .collect()
            })
            .collect();

        // CPU reference: expected_outputs[sample][output] =
        //     sum_j(weights[j][output] * inputs[j]) + biases[output].
        let mut expected_outputs = vec![vec![0.0; outputs_amount]; samples_amount];
        input_samples.iter().enumerate().for_each(|(i, inputs)| {
            for (j, input_to_outputs) in gpu_dense.weights.iter().enumerate() {
                for (k, weight) in input_to_outputs.iter().enumerate() {
                    expected_outputs[i][k] += weight * inputs[j]; }
            }
            for (k, bias) in gpu_dense.biases.iter().enumerate() {
                expected_outputs[i][k] += bias;
            }
        });

        // Upload the flattened samples with a blocking write so the data is
        // in place before `propagate` runs.
        let mut input_samples_buffer = Buffer::<cl_float>::create(
            &context,
            CL_MEM_READ_ONLY,
            samples_amount * inputs_amount,
            ptr::null_mut(),
        )
        .unwrap();
        let input_samples_gpu_write_event = queue
            .enqueue_write_buffer(
                &mut input_samples_buffer,
                CL_BLOCKING,
                0,
                input_samples
                    .iter()
                    .map(|x| x.to_vec())
                    .flatten()
                    .collect::<Vec<f32>>()
                    .as_slice(),
                &[],
            )
            .unwrap();
        input_samples_gpu_write_event.wait().unwrap();

        let gpu_outputs_buffer = gpu_dense.propagate(&input_samples_buffer).unwrap();

        // Read the outputs back (non-blocking enqueue + explicit wait).
        let mut outputs_vec = vec![0.0; samples_amount * outputs_amount];
        let gpu_flattend_outputs = outputs_vec.as_mut_slice();
        let read_flattened_outputs_gpu = queue
            .enqueue_read_buffer(
                &gpu_outputs_buffer,
                CL_NON_BLOCKING,
                0,
                gpu_flattend_outputs,
                &[],
            )
            .unwrap();
        read_flattened_outputs_gpu.wait().unwrap();

        let flattened_expected_outputs: Vec<f32> = expected_outputs
            .iter()
            .map(|x| x.to_vec())
            .flatten()
            .collect();

        // Element-wise comparison with a relative tolerance.
        {
            let a = &outputs_vec;
            let b = &flattened_expected_outputs;
            let max_dist = 0.01;
            assert_eq!(a.len(), b.len());
            a.iter().zip(b).for_each(|(x, y)| {
                assert!((x - y).abs() / x.max(*y) <= max_dist);
            });
        };
    }
}