use super::{
BufferHandle, GpuBackendTrait, MemoryManagerTrait, NeuralIntegrationError,
NeuralOperation, NeuralResult, ActivationFunction,
};
use std::sync::Arc;
pub fn execute_operation<T>(
operation: NeuralOperation<T>,
inputs: &[T],
backend: &Arc<dyn GpuBackendTrait>,
memory_manager: &Arc<dyn MemoryManagerTrait>,
) -> NeuralResult<Vec<T>>
where
T: Clone + Send + Sync + 'static + bytemuck::Pod,
{
match operation {
NeuralOperation::MatrixMultiply { a_rows, a_cols, b_cols, _phantom } => {
execute_matrix_multiply(a_rows, a_cols, b_cols, inputs, backend, memory_manager)
}
NeuralOperation::VectorAdd { size, _phantom } => {
execute_vector_add(size, inputs, backend, memory_manager)
}
NeuralOperation::ActivationFunction { function, size, _phantom } => {
execute_activation_function(function, size, inputs, backend, memory_manager)
}
NeuralOperation::Convolution { channels, kernel_size, stride, _phantom } => {
execute_convolution(channels, kernel_size, stride, inputs, backend, memory_manager)
}
NeuralOperation::ForwardPropagation { layer_sizes, _phantom } => {
execute_forward_propagation(&layer_sizes, inputs, backend, memory_manager)
}
NeuralOperation::BackwardPropagation { layer_sizes, _phantom } => {
execute_backward_propagation(&layer_sizes, inputs, backend, memory_manager)
}
NeuralOperation::Custom { kernel_source, name, _phantom } => {
execute_custom_kernel(&kernel_source, &name, inputs, backend, memory_manager)
}
}
}
pub fn process_batch<T>(
operations: Vec<NeuralOperation<T>>,
inputs: Vec<Vec<T>>,
backend: &Option<Arc<dyn GpuBackendTrait>>,
memory_manager: &Arc<dyn MemoryManagerTrait>,
batch_size: usize,
) -> NeuralResult<Vec<Vec<T>>>
where
T: Clone + Send + Sync + 'static + bytemuck::Pod + num_traits::Float,
{
if operations.len() != inputs.len() {
return Err(NeuralIntegrationError::OperationError(
"Operations and inputs count mismatch".to_string()
));
}
let mut results = Vec::with_capacity(operations.len());
if let Some(backend) = backend {
for chunk in operations.chunks(batch_size).zip(inputs.chunks(batch_size)) {
let (ops, ins) = chunk;
let batch_results = execute_batch_gpu(ops, ins, backend, memory_manager)?;
results.extend(batch_results);
}
} else {
for (operation, input) in operations.into_iter().zip(inputs.into_iter()) {
let result = super::bridge::execute_cpu_fallback(operation, &input)?;
results.push(result);
}
}
Ok(results)
}
fn execute_matrix_multiply<T>(
a_rows: usize,
a_cols: usize,
b_cols: usize,
inputs: &[T],
backend: &Arc<dyn GpuBackendTrait>,
memory_manager: &Arc<dyn MemoryManagerTrait>,
) -> NeuralResult<Vec<T>>
where
T: Clone + Send + Sync + 'static + bytemuck::Pod,
{
let expected_size = a_rows * a_cols + a_cols * b_cols;
if inputs.len() < expected_size {
return Err(NeuralIntegrationError::OperationError(
format!("Expected {} elements, got {}", expected_size, inputs.len())
));
}
let input_bytes: &[u8] = bytemuck::cast_slice(inputs);
let input_buffer = memory_manager.transfer_to_gpu(
&inputs.iter().map(|&x| unsafe { std::mem::transmute_copy(&x) }).collect::<Vec<f32>>()
)?;
let kernel_source = generate_matrix_multiply_kernel(a_rows, a_cols, b_cols);
let kernel = super::bridge::extract_wgsl_from_rust(&kernel_source)?;
let output_buffer = backend.execute_kernel(&kernel, &[input_buffer])?;
let result_f32 = memory_manager.transfer_from_gpu(output_buffer)?;
let result: Vec<T> = result_f32.iter()
.map(|&x| unsafe { std::mem::transmute_copy(&x) })
.collect();
Ok(result)
}
fn execute_vector_add<T>(
size: usize,
inputs: &[T],
backend: &Arc<dyn GpuBackendTrait>,
memory_manager: &Arc<dyn MemoryManagerTrait>,
) -> NeuralResult<Vec<T>>
where
T: Clone + Send + Sync + 'static + bytemuck::Pod,
{
if inputs.len() < size * 2 {
return Err(NeuralIntegrationError::OperationError(
format!("Expected {} elements, got {}", size * 2, inputs.len())
));
}
let input_buffer = memory_manager.transfer_to_gpu(
&inputs.iter().map(|&x| unsafe { std::mem::transmute_copy(&x) }).collect::<Vec<f32>>()
)?;
let kernel_source = generate_vector_add_kernel(size);
let kernel = super::bridge::extract_wgsl_from_rust(&kernel_source)?;
let output_buffer = backend.execute_kernel(&kernel, &[input_buffer])?;
let result_f32 = memory_manager.transfer_from_gpu(output_buffer)?;
let result: Vec<T> = result_f32.iter()
.take(size)
.map(|&x| unsafe { std::mem::transmute_copy(&x) })
.collect();
Ok(result)
}
fn execute_activation_function<T>(
function: ActivationFunction,
size: usize,
inputs: &[T],
backend: &Arc<dyn GpuBackendTrait>,
memory_manager: &Arc<dyn MemoryManagerTrait>,
) -> NeuralResult<Vec<T>>
where
T: Clone + Send + Sync + 'static + bytemuck::Pod,
{
if inputs.len() < size {
return Err(NeuralIntegrationError::OperationError(
format!("Expected {} elements, got {}", size, inputs.len())
));
}
let input_buffer = memory_manager.transfer_to_gpu(
&inputs.iter().take(size).map(|&x| unsafe { std::mem::transmute_copy(&x) }).collect::<Vec<f32>>()
)?;
let kernel_source = generate_activation_kernel(function, size);
let kernel = super::bridge::extract_wgsl_from_rust(&kernel_source)?;
let output_buffer = backend.execute_kernel(&kernel, &[input_buffer])?;
let result_f32 = memory_manager.transfer_from_gpu(output_buffer)?;
let result: Vec<T> = result_f32.iter()
.take(size)
.map(|&x| unsafe { std::mem::transmute_copy(&x) })
.collect();
Ok(result)
}
fn execute_convolution<T>(
channels: usize,
kernel_size: usize,
stride: usize,
inputs: &[T],
backend: &Arc<dyn GpuBackendTrait>,
memory_manager: &Arc<dyn MemoryManagerTrait>,
) -> NeuralResult<Vec<T>>
where
T: Clone + Send + Sync + 'static + bytemuck::Pod,
{
let input_buffer = memory_manager.transfer_to_gpu(
&inputs.iter().map(|&x| unsafe { std::mem::transmute_copy(&x) }).collect::<Vec<f32>>()
)?;
let kernel_source = generate_convolution_kernel(channels, kernel_size, stride);
let kernel = super::bridge::extract_wgsl_from_rust(&kernel_source)?;
let output_buffer = backend.execute_kernel(&kernel, &[input_buffer])?;
let result_f32 = memory_manager.transfer_from_gpu(output_buffer)?;
let result: Vec<T> = result_f32.iter()
.map(|&x| unsafe { std::mem::transmute_copy(&x) })
.collect();
Ok(result)
}
fn execute_forward_propagation<T>(
layer_sizes: &[usize],
inputs: &[T],
backend: &Arc<dyn GpuBackendTrait>,
memory_manager: &Arc<dyn MemoryManagerTrait>,
) -> NeuralResult<Vec<T>>
where
T: Clone + Send + Sync + 'static + bytemuck::Pod,
{
let input_buffer = memory_manager.transfer_to_gpu(
&inputs.iter().map(|&x| unsafe { std::mem::transmute_copy(&x) }).collect::<Vec<f32>>()
)?;
let kernel_source = generate_forward_propagation_kernel(layer_sizes);
let kernel = super::bridge::extract_wgsl_from_rust(&kernel_source)?;
let output_buffer = backend.execute_kernel(&kernel, &[input_buffer])?;
let result_f32 = memory_manager.transfer_from_gpu(output_buffer)?;
let result: Vec<T> = result_f32.iter()
.map(|&x| unsafe { std::mem::transmute_copy(&x) })
.collect();
Ok(result)
}
fn execute_backward_propagation<T>(
layer_sizes: &[usize],
inputs: &[T],
backend: &Arc<dyn GpuBackendTrait>,
memory_manager: &Arc<dyn MemoryManagerTrait>,
) -> NeuralResult<Vec<T>>
where
T: Clone + Send + Sync + 'static + bytemuck::Pod,
{
let input_buffer = memory_manager.transfer_to_gpu(
&inputs.iter().map(|&x| unsafe { std::mem::transmute_copy(&x) }).collect::<Vec<f32>>()
)?;
let kernel_source = generate_backward_propagation_kernel(layer_sizes);
let kernel = super::bridge::extract_wgsl_from_rust(&kernel_source)?;
let output_buffer = backend.execute_kernel(&kernel, &[input_buffer])?;
let result_f32 = memory_manager.transfer_from_gpu(output_buffer)?;
let result: Vec<T> = result_f32.iter()
.map(|&x| unsafe { std::mem::transmute_copy(&x) })
.collect();
Ok(result)
}
fn execute_custom_kernel<T>(
kernel_source: &str,
name: &str,
inputs: &[T],
backend: &Arc<dyn GpuBackendTrait>,
memory_manager: &Arc<dyn MemoryManagerTrait>,
) -> NeuralResult<Vec<T>>
where
T: Clone + Send + Sync + 'static + bytemuck::Pod,
{
let input_buffer = memory_manager.transfer_to_gpu(
&inputs.iter().map(|&x| unsafe { std::mem::transmute_copy(&x) }).collect::<Vec<f32>>()
)?;
let kernel = super::bridge::extract_wgsl_from_rust(kernel_source)?;
let output_buffer = backend.execute_kernel(&kernel, &[input_buffer])?;
let result_f32 = memory_manager.transfer_from_gpu(output_buffer)?;
let result: Vec<T> = result_f32.iter()
.map(|&x| unsafe { std::mem::transmute_copy(&x) })
.collect();
Ok(result)
}
fn execute_batch_gpu<T>(
operations: &[NeuralOperation<T>],
inputs: &[Vec<T>],
backend: &Arc<dyn GpuBackendTrait>,
memory_manager: &Arc<dyn MemoryManagerTrait>,
) -> NeuralResult<Vec<Vec<T>>>
where
T: Clone + Send + Sync + 'static + bytemuck::Pod,
{
let mut results = Vec::with_capacity(operations.len());
for (operation, input) in operations.iter().zip(inputs.iter()) {
let result = execute_operation(operation.clone(), input, backend, memory_manager)?;
results.push(result);
}
Ok(results)
}
fn generate_matrix_multiply_kernel(a_rows: usize, a_cols: usize, b_cols: usize) -> String {
format!(r#"
__global__ void matrix_multiply(float* a, float* b, float* c) {{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < {a_rows} && col < {b_cols}) {{
float sum = 0.0f;
for (int k = 0; k < {a_cols}; k++) {{
sum += a[row * {a_cols} + k] * b[k * {b_cols} + col];
}}
c[row * {b_cols} + col] = sum;
}}
}}
"#)
}
fn generate_vector_add_kernel(size: usize) -> String {
format!(r#"
__global__ void vector_add(float* a, float* b, float* c) {{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < {size}) {{
c[i] = a[i] + b[i];
}}
}}
"#)
}
fn generate_activation_kernel(function: ActivationFunction, size: usize) -> String {
let activation_code = match function {
ActivationFunction::Sigmoid => "1.0f / (1.0f + expf(-x))",
ActivationFunction::ReLU => "fmaxf(0.0f, x)",
ActivationFunction::Tanh => "tanhf(x)",
ActivationFunction::LeakyReLU => "x > 0.0f ? x : 0.01f * x",
ActivationFunction::Swish => "x / (1.0f + expf(-x))",
ActivationFunction::GELU => "0.5f * x * (1.0f + tanhf(0.7978845608f * (x + 0.044715f * x * x * x)))",
};
format!(r#"
__global__ void activation_function(float* input, float* output) {{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < {size}) {{
float x = input[i];
output[i] = {activation_code};
}}
}}
"#)
}
fn generate_convolution_kernel(channels: usize, kernel_size: usize, stride: usize) -> String {
format!(r#"
__global__ void convolution(float* input, float* kernel, float* output,
int input_width, int input_height, int output_width, int output_height) {{
int out_x = blockIdx.x * blockDim.x + threadIdx.x;
int out_y = blockIdx.y * blockDim.y + threadIdx.y;
int channel = blockIdx.z;
if (out_x < output_width && out_y < output_height && channel < {channels}) {{
float sum = 0.0f;
for (int ky = 0; ky < {kernel_size}; ky++) {{
for (int kx = 0; kx < {kernel_size}; kx++) {{
int in_x = out_x * {stride} + kx;
int in_y = out_y * {stride} + ky;
if (in_x < input_width && in_y < input_height) {{
int input_idx = channel * input_width * input_height + in_y * input_width + in_x;
int kernel_idx = channel * {kernel_size} * {kernel_size} + ky * {kernel_size} + kx;
sum += input[input_idx] * kernel[kernel_idx];
}}
}}
}}
int output_idx = channel * output_width * output_height + out_y * output_width + out_x;
output[output_idx] = sum;
}}
}}
"#)
}
fn generate_forward_propagation_kernel(layer_sizes: &[usize]) -> String {
let num_layers = layer_sizes.len();
let weights_calculation = (0..num_layers-1).map(|i| {
format!(r#"
// Layer {} to {}
if (layer == {}) {{
for (int j = 0; j < {}; j++) {{
float sum = 0.0f;
for (int k = 0; k < {}; k++) {{
sum += activations[prev_layer_offset + k] * weights[weight_offset + j * {} + k];
}}
activations[current_layer_offset + j] = 1.0f / (1.0f + expf(-sum)); // Sigmoid
}}
}}
"#, i, i+1, i, layer_sizes[i+1], layer_sizes[i], layer_sizes[i])
}).collect::<Vec<_>>().join("\n");
format!(r#"
__global__ void forward_propagation(float* inputs, float* weights, float* biases, float* activations) {{
int neuron = blockIdx.x * blockDim.x + threadIdx.x;
int layer = blockIdx.y;
// Copy inputs to first layer
if (layer == 0 && neuron < {}) {{
activations[neuron] = inputs[neuron];
return;
}}
// Calculate layer offsets
int prev_layer_offset = 0;
int current_layer_offset = 0;
int weight_offset = 0;
for (int i = 0; i < layer; i++) {{
if (i < layer - 1) prev_layer_offset += layer_sizes[i];
current_layer_offset += layer_sizes[i];
if (i < layer - 1) weight_offset += layer_sizes[i] * layer_sizes[i + 1];
}}
{}
}}
"#, layer_sizes[0], weights_calculation)
}
fn generate_backward_propagation_kernel(layer_sizes: &[usize]) -> String {
format!(r#"
__global__ void backward_propagation(float* activations, float* weights,
float* errors, float* gradients, float* targets) {{
int neuron = blockIdx.x * blockDim.x + threadIdx.x;
int layer = blockIdx.y;
// Calculate output layer errors
if (layer == {} - 1) {{
if (neuron < {}) {{
float output = activations[neuron]; // Assuming output layer offset
float target = targets[neuron];
errors[neuron] = (output - target) * output * (1.0f - output); // Sigmoid derivative
}}
return;
}}
// Backpropagate errors for hidden layers
// Implementation depends on network architecture
// This is a simplified version
if (layer > 0 && neuron < layer_sizes[layer]) {{
float error_sum = 0.0f;
for (int next_neuron = 0; next_neuron < layer_sizes[layer + 1]; next_neuron++) {{
error_sum += errors[next_neuron] * weights[neuron * layer_sizes[layer + 1] + next_neuron];
}}
float activation = activations[neuron];
errors[neuron] = error_sum * activation * (1.0f - activation);
}}
}}
"#, layer_sizes.len(), layer_sizes.last().unwrap_or(&0))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_kernel_generation() {
let kernel = generate_matrix_multiply_kernel(4, 4, 4);
assert!(kernel.contains("matrix_multiply"));
assert!(kernel.contains("blockIdx"));
assert!(kernel.contains("threadIdx"));
}
#[test]
fn test_activation_kernel_generation() {
let kernel = generate_activation_kernel(ActivationFunction::ReLU, 128);
assert!(kernel.contains("fmaxf"));
assert!(kernel.contains("activation_function"));
}
#[test]
fn test_vector_add_kernel() {
let kernel = generate_vector_add_kernel(256);
assert!(kernel.contains("vector_add"));
assert!(kernel.contains("256"));
}
}