use ghostflow_core::Tensor;
use std::sync::{Arc, Mutex};
/// Compute backend that a context or tensor is tagged with.
///
/// The variants are plain tags: nothing in this enum probes for hardware.
/// `Eq` and `Hash` are derived so the type can be used as a map/set key
/// (for example to keep per-backend state).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum DeviceType {
    /// Host CPU backend.
    CPU,
    /// CUDA backend.
    CUDA,
    /// OpenCL backend.
    OpenCL,
    /// Metal backend.
    Metal,
    /// Vulkan backend.
    Vulkan,
}
/// Static description of a compute device.
#[derive(Clone, Debug)]
pub struct DeviceInfo {
    /// Backend this device belongs to.
    pub device_type: DeviceType,
    /// Index of the device within its backend.
    pub device_id: usize,
    /// Human-readable device name.
    pub name: String,
    /// Compute capability pair (presumably `(major, minor)`); the CPU
    /// descriptor uses `(0, 0)`.
    pub compute_capability: (u32, u32),
    /// Total device memory. The CUDA descriptor fills this in bytes; the CPU
    /// descriptor reports `0`.
    pub total_memory: usize,
    /// Memory currently available, in the same unit as `total_memory`.
    pub available_memory: usize,
}

impl DeviceInfo {
    /// Number of bytes in one GiB, kept as `u64` so multiples of it can be
    /// computed without overflowing a 32-bit `usize`.
    const GIB: u64 = 1 << 30;

    /// Converts a GiB count to bytes as `usize`, saturating at `usize::MAX`
    /// when the value does not fit the target's pointer width.
    fn gib_to_bytes(gib: u64) -> usize {
        let bytes = gib.saturating_mul(Self::GIB);
        // `usize` is at most 64 bits wide on supported targets, so widening
        // `usize::MAX` is lossless and the final narrowing cannot truncate.
        bytes.min(usize::MAX as u64) as usize
    }

    /// Descriptor for the host CPU: device 0, named `"CPU"`, with the
    /// capability and memory fields zeroed.
    pub fn cpu() -> Self {
        DeviceInfo {
            device_type: DeviceType::CPU,
            device_id: 0,
            name: "CPU".to_string(),
            compute_capability: (0, 0),
            total_memory: 0,
            available_memory: 0,
        }
    }

    /// Descriptor for CUDA device `device_id`.
    ///
    /// The capability `(7, 5)` and the 8 GiB / 7 GiB memory figures are
    /// hard-coded placeholders; no driver is queried. The sizes are computed
    /// in `u64` and saturated, because `8 * 1024 * 1024 * 1024` overflows
    /// `usize` on 32-bit targets.
    pub fn cuda(device_id: usize) -> Self {
        DeviceInfo {
            device_type: DeviceType::CUDA,
            device_id,
            name: format!("CUDA Device {}", device_id),
            compute_capability: (7, 5),
            total_memory: Self::gib_to_bytes(8),
            available_memory: Self::gib_to_bytes(7),
        }
    }
}
/// Execution context tied to a single device descriptor.
///
/// `stream` and `memory_pool` are stored but never read by the methods in this
/// block, hence the `dead_code` allowances.
pub struct GPUContext {
    device: DeviceInfo,
    #[allow(dead_code)]
    stream: Option<usize>,
    #[allow(dead_code)]
    memory_pool: Arc<Mutex<Vec<Vec<f32>>>>,
}

impl GPUContext {
    /// Wraps `device` in a context with no stream and an empty memory pool.
    pub fn new(device: DeviceInfo) -> Self {
        Self {
            device,
            stream: None,
            memory_pool: Arc::default(),
        }
    }

    /// Context for the host CPU.
    pub fn cpu() -> Self {
        Self::new(DeviceInfo::cpu())
    }

    /// Context for CUDA device `device_id`.
    ///
    /// # Errors
    /// Returns `"CUDA not available"` when [`Self::is_cuda_available`] is false.
    pub fn cuda(device_id: usize) -> Result<Self, String> {
        if !Self::is_cuda_available() {
            return Err(String::from("CUDA not available"));
        }
        Ok(Self::new(DeviceInfo::cuda(device_id)))
    }

    /// Reports CUDA availability; this implementation always answers `false`.
    pub fn is_cuda_available() -> bool {
        false
    }

    /// Reports OpenCL availability; this implementation always answers `false`.
    pub fn is_opencl_available() -> bool {
        false
    }

    /// Number of usable CUDA devices: `1` when CUDA is available, else `0`.
    pub fn device_count() -> usize {
        usize::from(Self::is_cuda_available())
    }

    /// Descriptor of the device this context was created for.
    pub fn device_info(&self) -> &DeviceInfo {
        &self.device
    }

    /// Waits for outstanding work; currently a no-op.
    pub fn synchronize(&self) {}

    /// Returns a zero-filled buffer of `size` elements.
    ///
    /// The buffer is an ordinary `Vec`; the memory pool is not consulted.
    pub fn allocate(&self, size: usize) -> Vec<f32> {
        vec![0.0_f32; size]
    }

    /// Releases `_buffer` by dropping it; nothing is returned to the pool.
    pub fn deallocate(&self, _buffer: Vec<f32>) {
        drop(_buffer);
    }
}
/// Dense `f32` tensor stored in a flat `Vec`, tagged with a device and the
/// context it was created in.
///
/// The indexing in `matmul` and `softmax` treats `data` as row-major.
pub struct GPUTensor {
    data: Vec<f32>,
    dims: Vec<usize>,
    device: DeviceType,
    context: Arc<GPUContext>,
}

impl GPUTensor {
    /// Builds a tensor from raw values and a shape; the device tag is copied
    /// from `context`. The length of `data` is not checked against `dims`.
    pub fn new(data: Vec<f32>, dims: Vec<usize>, context: Arc<GPUContext>) -> Self {
        let device = context.device.device_type;
        Self {
            data,
            dims,
            device,
            context,
        }
    }

    /// Copies the values and shape of `tensor` into a new `GPUTensor`.
    pub fn from_tensor(tensor: &Tensor, context: Arc<GPUContext>) -> Self {
        let values = tensor.data_f32().to_vec();
        let shape = tensor.dims().to_vec();
        Self::new(values, shape, context)
    }

    /// Converts back to a core `Tensor`.
    ///
    /// # Panics
    /// Panics if `Tensor::from_slice` rejects the data/shape pair.
    pub fn to_tensor(&self) -> Tensor {
        Tensor::from_slice(&self.data, &self.dims).unwrap()
    }

    /// Re-tags the tensor with `device`. Only the tag changes; the data stays
    /// in the same `Vec`.
    pub fn to_device(&mut self, device: DeviceType) {
        if self.device != device {
            self.device = device;
        }
    }

    /// Shape of the tensor.
    pub fn dims(&self) -> &[usize] {
        &self.dims
    }

    /// Device tag currently attached to the tensor.
    pub fn device(&self) -> DeviceType {
        self.device
    }

    /// Matrix product of two rank-2 tensors (`[m, k] x [k, n] -> [m, n]`).
    ///
    /// # Panics
    /// Panics if either operand is not rank 2 or the inner dimensions differ.
    pub fn matmul(&self, other: &GPUTensor) -> GPUTensor {
        assert_eq!(self.dims.len(), 2);
        assert_eq!(other.dims.len(), 2);
        assert_eq!(self.dims[1], other.dims[0]);

        let (rows, inner, cols) = (self.dims[0], self.dims[1], other.dims[1]);
        let mut product = vec![0.0f32; rows * cols];
        for row in 0..rows {
            for col in 0..cols {
                // Accumulate in ascending order of the shared index, starting
                // from +0.0.
                product[row * cols + col] = (0..inner).fold(0.0f32, |acc, idx| {
                    acc + self.data[row * inner + idx] * other.data[idx * cols + col]
                });
            }
        }
        GPUTensor::new(product, vec![rows, cols], self.context.clone())
    }

    /// Element-wise sum.
    ///
    /// # Panics
    /// Panics if the shapes differ.
    pub fn add(&self, other: &GPUTensor) -> GPUTensor {
        assert_eq!(self.dims, other.dims);
        let summed: Vec<f32> = self
            .data
            .iter()
            .zip(&other.data)
            .map(|(lhs, rhs)| lhs + rhs)
            .collect();
        GPUTensor::new(summed, self.dims.clone(), self.context.clone())
    }

    /// Element-wise product.
    ///
    /// # Panics
    /// Panics if the shapes differ.
    pub fn mul(&self, other: &GPUTensor) -> GPUTensor {
        assert_eq!(self.dims, other.dims);
        let multiplied: Vec<f32> = self
            .data
            .iter()
            .zip(&other.data)
            .map(|(lhs, rhs)| lhs * rhs)
            .collect();
        GPUTensor::new(multiplied, self.dims.clone(), self.context.clone())
    }

    /// Multiplies every element by `scalar`.
    pub fn scale(&self, scalar: f32) -> GPUTensor {
        let scaled: Vec<f32> = self.data.iter().map(|value| value * scalar).collect();
        GPUTensor::new(scaled, self.dims.clone(), self.context.clone())
    }

    /// Applies `max(x, 0)` to every element.
    pub fn relu(&self) -> GPUTensor {
        let rectified: Vec<f32> = self
            .data
            .iter()
            .copied()
            .map(|value| f32::max(value, 0.0))
            .collect();
        GPUTensor::new(rectified, self.dims.clone(), self.context.clone())
    }

    /// Row-wise softmax of a rank-2 tensor; each row of `dims[1]` values is
    /// normalised independently.
    ///
    /// The row maximum is subtracted before exponentiating.
    ///
    /// # Panics
    /// Panics if the tensor is not rank 2.
    pub fn softmax(&self) -> GPUTensor {
        assert_eq!(self.dims.len(), 2);
        let (rows, width) = (self.dims[0], self.dims[1]);
        let mut probs = vec![0.0f32; self.data.len()];

        for row in 0..rows {
            let offset = row * width;
            let logits = &self.data[offset..offset + width];
            let out = &mut probs[offset..offset + width];

            let row_max = logits.iter().copied().fold(f32::NEG_INFINITY, f32::max);

            let mut denom = 0.0f32;
            for (slot, &logit) in out.iter_mut().zip(logits) {
                let exp_val = (logit - row_max).exp();
                *slot = exp_val;
                denom += exp_val;
            }
            for slot in out.iter_mut() {
                *slot /= denom;
            }
        }
        GPUTensor::new(probs, self.dims.clone(), self.context.clone())
    }

    /// Sum of all elements.
    pub fn sum(&self) -> f32 {
        self.data.iter().sum::<f32>()
    }

    /// Arithmetic mean of all elements (`NaN` for an empty tensor, since it
    /// divides by the element count).
    pub fn mean(&self) -> f32 {
        let count = self.data.len() as f32;
        self.sum() / count
    }
}
/// Neural-network operators that produce tensors bound to a shared context.
///
/// All operators here run as plain loops over the tensors' flat `f32`
/// buffers, indexed as row-major `[batch, channels, ...spatial]`.
pub struct GPUOps {
    context: Arc<GPUContext>,
}

impl GPUOps {
    /// Creates an operator set whose outputs are attached to `context`.
    pub fn new(context: Arc<GPUContext>) -> Self {
        GPUOps { context }
    }

    /// 2-D convolution (cross-correlation, no kernel flip) with zero padding.
    ///
    /// * `input` - shape `[batch, in_channels, in_h, in_w]`.
    /// * `kernel` - shape `[out_channels, in_channels, kh, kw]`.
    /// * `stride` - `(vertical, horizontal)` step of the window.
    /// * `padding` - `(vertical, horizontal)` zero padding added on each side.
    ///
    /// Returns a tensor of shape `[batch, out_channels, out_h, out_w]` with
    /// `out = (in + 2 * padding - k) / stride + 1` per spatial axis.
    ///
    /// # Panics
    /// Panics if either tensor is not rank 4, the channel counts disagree,
    /// a stride is zero, or the kernel is larger than the padded input.
    pub fn conv2d(
        &self,
        input: &GPUTensor,
        kernel: &GPUTensor,
        stride: (usize, usize),
        padding: (usize, usize),
    ) -> GPUTensor {
        let input_dims = input.dims();
        let kernel_dims = kernel.dims();
        assert_eq!(input_dims.len(), 4, "conv2d: input must be rank 4");
        assert_eq!(kernel_dims.len(), 4, "conv2d: kernel must be rank 4");

        let batch = input_dims[0];
        let in_channels = input_dims[1];
        let in_h = input_dims[2];
        let in_w = input_dims[3];
        let out_channels = kernel_dims[0];
        let kh = kernel_dims[2];
        let kw = kernel_dims[3];

        assert_eq!(
            kernel_dims[1], in_channels,
            "conv2d: kernel and input channel counts differ"
        );
        assert!(
            stride.0 > 0 && stride.1 > 0,
            "conv2d: stride must be non-zero"
        );

        let padded_h = in_h + 2 * padding.0;
        let padded_w = in_w + 2 * padding.1;
        // Checked up front so the subtraction below cannot underflow.
        assert!(
            padded_h >= kh && padded_w >= kw,
            "conv2d: kernel is larger than the padded input"
        );

        let out_h = (padded_h - kh) / stride.0 + 1;
        let out_w = (padded_w - kw) / stride.1 + 1;
        let mut output = vec![0.0f32; batch * out_channels * out_h * out_w];

        for b in 0..batch {
            for oc in 0..out_channels {
                for oh in 0..out_h {
                    for ow in 0..out_w {
                        let mut acc = 0.0f32;
                        for ic in 0..in_channels {
                            for ky in 0..kh {
                                // Row in padded coordinates; taps that fall in
                                // the zero padding contribute nothing.
                                let padded_y = oh * stride.0 + ky;
                                if padded_y < padding.0 || padded_y - padding.0 >= in_h {
                                    continue;
                                }
                                let ih = padded_y - padding.0;
                                for kx in 0..kw {
                                    let padded_x = ow * stride.1 + kx;
                                    if padded_x < padding.1 || padded_x - padding.1 >= in_w {
                                        continue;
                                    }
                                    let iw = padded_x - padding.1;
                                    let in_idx = ((b * in_channels + ic) * in_h + ih) * in_w + iw;
                                    let k_idx = ((oc * in_channels + ic) * kh + ky) * kw + kx;
                                    acc += input.data[in_idx] * kernel.data[k_idx];
                                }
                            }
                        }
                        let out_idx = ((b * out_channels + oc) * out_h + oh) * out_w + ow;
                        output[out_idx] = acc;
                    }
                }
            }
        }

        GPUTensor::new(
            output,
            vec![batch, out_channels, out_h, out_w],
            self.context.clone(),
        )
    }

    /// Inference-style batch normalisation using the supplied running
    /// statistics: `gamma[c] * (x - mean[c]) / sqrt(var[c] + eps) + beta[c]`.
    ///
    /// `input` is indexed as `[batch, channels, ...spatial]`; the four
    /// parameter tensors are read as one value per channel.
    ///
    /// # Panics
    /// Panics if `input` has fewer than two dimensions or a parameter tensor
    /// holds fewer than `channels` values.
    pub fn batch_norm(
        &self,
        input: &GPUTensor,
        gamma: &GPUTensor,
        beta: &GPUTensor,
        running_mean: &GPUTensor,
        running_var: &GPUTensor,
        eps: f32,
    ) -> GPUTensor {
        let dims = input.dims();
        assert!(
            dims.len() >= 2,
            "batch_norm: input needs batch and channel dimensions"
        );
        let batch = dims[0];
        let channels = dims[1];
        // Product over an empty slice is 1, so rank-2 input has one value per channel.
        let spatial_size: usize = dims[2..].iter().product();

        for (name, param) in [
            ("gamma", gamma),
            ("beta", beta),
            ("running_mean", running_mean),
            ("running_var", running_var),
        ] {
            assert!(
                param.data.len() >= channels,
                "batch_norm: {} has fewer values than channels",
                name
            );
        }

        let mut output = input.data.clone();
        for c in 0..channels {
            let mean = running_mean.data[c];
            let std = (running_var.data[c] + eps).sqrt();
            let scale = gamma.data[c];
            let shift = beta.data[c];
            for b in 0..batch {
                let start = (b * channels + c) * spatial_size;
                for value in &mut output[start..start + spatial_size] {
                    let normalised = (*value - mean) / std;
                    *value = scale * normalised + shift;
                }
            }
        }
        GPUTensor::new(output, dims.to_vec(), self.context.clone())
    }

    /// 2-D max pooling without padding over `[batch, channels, in_h, in_w]`.
    ///
    /// Output spatial size is `(in - kernel) / stride + 1` per axis. A window
    /// with zero area yields `f32::NEG_INFINITY`.
    ///
    /// # Panics
    /// Panics if `input` is not rank 4, a stride is zero, or the window is
    /// larger than the input.
    pub fn max_pool2d(
        &self,
        input: &GPUTensor,
        kernel_size: (usize, usize),
        stride: (usize, usize),
    ) -> GPUTensor {
        let dims = input.dims();
        assert_eq!(dims.len(), 4, "max_pool2d: input must be rank 4");
        let batch = dims[0];
        let channels = dims[1];
        let in_h = dims[2];
        let in_w = dims[3];

        assert!(
            stride.0 > 0 && stride.1 > 0,
            "max_pool2d: stride must be non-zero"
        );
        // Checked up front so the subtraction below cannot underflow.
        assert!(
            kernel_size.0 <= in_h && kernel_size.1 <= in_w,
            "max_pool2d: window is larger than the input"
        );

        let out_h = (in_h - kernel_size.0) / stride.0 + 1;
        let out_w = (in_w - kernel_size.1) / stride.1 + 1;
        let mut output = vec![f32::NEG_INFINITY; batch * channels * out_h * out_w];

        for b in 0..batch {
            for c in 0..channels {
                let plane = b * channels + c;
                for oh in 0..out_h {
                    for ow in 0..out_w {
                        let mut max_val = f32::NEG_INFINITY;
                        // Every window lies fully inside the input because of
                        // how out_h/out_w are derived, so no bounds test is needed.
                        for kh in 0..kernel_size.0 {
                            let ih = oh * stride.0 + kh;
                            for kw in 0..kernel_size.1 {
                                let iw = ow * stride.1 + kw;
                                let in_idx = (plane * in_h + ih) * in_w + iw;
                                max_val = max_val.max(input.data[in_idx]);
                            }
                        }
                        output[(plane * out_h + oh) * out_w + ow] = max_val;
                    }
                }
            }
        }

        GPUTensor::new(
            output,
            vec![batch, channels, out_h, out_w],
            self.context.clone(),
        )
    }
}
/// Byte-level accounting of buffers handed out through a [`GPUContext`].
///
/// Lock order: whenever both counters are needed, `allocated` is locked
/// before `peak`. Every method follows this order so that concurrent calls
/// cannot deadlock.
pub struct GPUMemoryManager {
    context: Arc<GPUContext>,
    allocated: Arc<Mutex<usize>>,
    peak: Arc<Mutex<usize>>,
}

impl GPUMemoryManager {
    /// Creates a manager with both counters at zero.
    pub fn new(context: Arc<GPUContext>) -> Self {
        GPUMemoryManager {
            context,
            allocated: Arc::new(Mutex::new(0)),
            peak: Arc::new(Mutex::new(0)),
        }
    }

    /// Allocates `size` `f32` elements through the context and records
    /// `size * size_of::<f32>()` bytes as allocated, raising the peak if it
    /// is exceeded. The byte counter saturates instead of overflowing.
    ///
    /// # Panics
    /// Panics if a counter mutex is poisoned.
    pub fn allocate(&self, size: usize) -> Vec<f32> {
        let bytes = size.saturating_mul(std::mem::size_of::<f32>());
        {
            let mut allocated = self.allocated.lock().unwrap();
            *allocated = allocated.saturating_add(bytes);
            let mut peak = self.peak.lock().unwrap();
            *peak = (*peak).max(*allocated);
        }
        // Both guards are released before the allocation itself.
        self.context.allocate(size)
    }

    /// Hands `buffer` back to the context and lowers the allocated counter by
    /// the buffer's size in bytes, saturating at zero.
    ///
    /// # Panics
    /// Panics if the counter mutex is poisoned.
    pub fn deallocate(&self, buffer: Vec<f32>) {
        let bytes = buffer.len() * std::mem::size_of::<f32>();
        {
            let mut allocated = self.allocated.lock().unwrap();
            *allocated = allocated.saturating_sub(bytes);
        }
        self.context.deallocate(buffer);
    }

    /// Bytes currently recorded as allocated.
    pub fn allocated_memory(&self) -> usize {
        *self.allocated.lock().unwrap()
    }

    /// Highest allocated byte count seen since creation or the last
    /// [`Self::reset_peak`].
    pub fn peak_memory(&self) -> usize {
        *self.peak.lock().unwrap()
    }

    /// Resets the peak to the current allocated byte count.
    ///
    /// Locks `allocated` before `peak`, matching [`Self::allocate`]; taking
    /// them in the opposite order could deadlock against a concurrent
    /// `allocate`.
    pub fn reset_peak(&self) {
        let allocated = self.allocated.lock().unwrap();
        let mut peak = self.peak.lock().unwrap();
        *peak = *allocated;
    }
}
/// Dynamic loss scaling for mixed-precision training.
///
/// While enabled, the loss is multiplied by `loss_scale` and gradients are
/// divided by it again. The scale shrinks by `backoff_factor` whenever a step
/// reports non-finite gradients and grows by `growth_factor` after
/// `growth_interval` consecutive clean steps.
pub struct AutoMixedPrecision {
    enabled: bool,
    loss_scale: f32,
    growth_factor: f32,
    backoff_factor: f32,
    growth_interval: usize,
    iterations: usize,
}

impl AutoMixedPrecision {
    /// Smallest value the scale may shrink to. Keeping it strictly positive
    /// stops repeated backoff from reaching `0.0`, which would turn
    /// `unscale_gradients` into a division by zero.
    const MIN_LOSS_SCALE: f32 = f32::MIN_POSITIVE;

    /// Largest value the scale may grow to. Keeping it finite stops repeated
    /// growth from reaching infinity.
    const MAX_LOSS_SCALE: f32 = f32::MAX;

    /// Creates a disabled scaler with an initial scale of `65536`, growth
    /// factor `2`, backoff factor `0.5` and a growth interval of `2000` steps.
    pub fn new() -> Self {
        AutoMixedPrecision {
            enabled: false,
            loss_scale: 65536.0,
            growth_factor: 2.0,
            backoff_factor: 0.5,
            growth_interval: 2000,
            iterations: 0,
        }
    }

    /// Builder-style switch that turns scaling on.
    pub fn enable(mut self) -> Self {
        self.enabled = true;
        self
    }

    /// Returns `loss` multiplied by the current scale, or `loss` unchanged
    /// when scaling is disabled.
    pub fn scale_loss(&mut self, loss: f32) -> f32 {
        if self.enabled {
            loss * self.loss_scale
        } else {
            loss
        }
    }

    /// Divides every gradient by the current scale in place; does nothing
    /// when scaling is disabled.
    pub fn unscale_gradients(&mut self, gradients: &mut [f32]) {
        if !self.enabled {
            return;
        }
        for grad in gradients.iter_mut() {
            *grad /= self.loss_scale;
        }
    }

    /// Adjusts the scale after a training step.
    ///
    /// * `found_inf` - whether the step produced non-finite gradients.
    ///
    /// A step with `found_inf` shrinks the scale and restarts the clean-step
    /// counter. Otherwise the counter advances, and on reaching the growth
    /// interval the scale grows and the counter restarts. The scale is
    /// clamped to `[f32::MIN_POSITIVE, f32::MAX]` so it never becomes zero
    /// or infinite. Does nothing when scaling is disabled.
    pub fn update_scale(&mut self, found_inf: bool) {
        if !self.enabled {
            return;
        }
        if found_inf {
            self.loss_scale = (self.loss_scale * self.backoff_factor).max(Self::MIN_LOSS_SCALE);
            self.iterations = 0;
            return;
        }
        self.iterations += 1;
        if self.iterations >= self.growth_interval {
            self.loss_scale = (self.loss_scale * self.growth_factor).min(Self::MAX_LOSS_SCALE);
            self.iterations = 0;
        }
    }
}

impl Default for AutoMixedPrecision {
    /// Same as [`AutoMixedPrecision::new`].
    fn default() -> Self {
        Self::new()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shared CPU context used by the tensor and operator tests.
    fn cpu_context() -> Arc<GPUContext> {
        Arc::new(GPUContext::cpu())
    }

    #[test]
    fn test_gpu_context() {
        let ctx = GPUContext::cpu();
        let info = ctx.device_info();
        assert_eq!(info.device_type, DeviceType::CPU);
        assert_eq!(info.device_id, 0);
        assert_eq!(info.name, "CPU");
    }

    #[test]
    fn test_cuda_context_matches_availability() {
        // A CUDA context can be created exactly when CUDA is reported available.
        assert_eq!(
            GPUContext::cuda(0).is_ok(),
            GPUContext::is_cuda_available()
        );
        if !GPUContext::is_cuda_available() {
            assert_eq!(GPUContext::device_count(), 0);
        }
    }

    #[test]
    fn test_context_allocate_is_zeroed() {
        let ctx = GPUContext::cpu();
        let buffer = ctx.allocate(8);
        assert_eq!(buffer, vec![0.0f32; 8]);
        ctx.deallocate(buffer);
    }

    #[test]
    fn test_gpu_tensor() {
        let data = vec![1.0, 2.0, 3.0, 4.0];
        let mut tensor = GPUTensor::new(data, vec![2, 2], cpu_context());
        assert_eq!(tensor.dims(), &[2, 2]);
        assert_eq!(tensor.device(), DeviceType::CPU);
        assert_eq!(tensor.sum(), 10.0);
        assert_eq!(tensor.mean(), 2.5);

        tensor.to_device(DeviceType::CUDA);
        assert_eq!(tensor.device(), DeviceType::CUDA);
    }

    #[test]
    fn test_gpu_tensor_ops() {
        let ctx = cpu_context();
        let a = GPUTensor::new(vec![1.0, 2.0, 3.0, 4.0], vec![2, 2], ctx.clone());
        let b = GPUTensor::new(vec![2.0, 2.0, 2.0, 2.0], vec![2, 2], ctx.clone());

        let c = a.add(&b);
        assert_eq!(c.data, vec![3.0, 4.0, 5.0, 6.0]);

        let d = a.scale(2.0);
        assert_eq!(d.data, vec![2.0, 4.0, 6.0, 8.0]);

        let e = a.mul(&b);
        assert_eq!(e.data, vec![2.0, 4.0, 6.0, 8.0]);
        assert_eq!(e.dims(), &[2, 2]);
    }

    #[test]
    fn test_gpu_matmul() {
        let ctx = cpu_context();
        let a = GPUTensor::new(vec![1.0, 2.0, 3.0, 4.0], vec![2, 2], ctx.clone());
        let identity = GPUTensor::new(vec![1.0, 0.0, 0.0, 1.0], vec![2, 2], ctx.clone());

        let c = a.matmul(&identity);
        assert_eq!(c.dims(), &[2, 2]);
        assert_eq!(c.data, vec![1.0, 2.0, 3.0, 4.0]);

        let b = GPUTensor::new(vec![5.0, 6.0, 7.0, 8.0], vec![2, 2], ctx.clone());
        let d = a.matmul(&b);
        assert_eq!(d.data, vec![19.0, 22.0, 43.0, 50.0]);

        // Non-square shapes: [1, 2] x [2, 3] -> [1, 3].
        let row = GPUTensor::new(vec![1.0, 2.0], vec![1, 2], ctx.clone());
        let wide = GPUTensor::new(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0], vec![2, 3], ctx);
        let f = row.matmul(&wide);
        assert_eq!(f.dims(), &[1, 3]);
        assert_eq!(f.data, vec![9.0, 12.0, 15.0]);
    }

    #[test]
    fn test_gpu_relu() {
        let tensor = GPUTensor::new(vec![-1.0, 2.0, -3.0, 4.0], vec![2, 2], cpu_context());
        let result = tensor.relu();
        assert_eq!(result.data, vec![0.0, 2.0, 0.0, 4.0]);
    }

    #[test]
    fn test_gpu_softmax() {
        // Rows with equal logits must become uniform distributions.
        let tensor = GPUTensor::new(vec![0.0, 0.0, 1.0, 1.0], vec![2, 2], cpu_context());
        let result = tensor.softmax();
        assert_eq!(result.dims(), &[2, 2]);
        assert_eq!(result.data, vec![0.5, 0.5, 0.5, 0.5]);

        let skewed = GPUTensor::new(vec![1.0, 2.0, 3.0], vec![1, 3], cpu_context());
        let probs = skewed.softmax();
        assert!((probs.sum() - 1.0).abs() < 1e-6);
        assert!(probs.data[0] < probs.data[1] && probs.data[1] < probs.data[2]);
    }

    #[test]
    fn test_conv2d_output_shape() {
        let ctx = cpu_context();
        let ops = GPUOps::new(ctx.clone());
        let input = GPUTensor::new(vec![1.0; 9], vec![1, 1, 3, 3], ctx.clone());
        let kernel = GPUTensor::new(vec![1.0; 4], vec![1, 1, 2, 2], ctx);
        let out = ops.conv2d(&input, &kernel, (1, 1), (0, 0));
        assert_eq!(out.dims(), &[1, 1, 2, 2]);
        assert_eq!(out.data.len(), 4);
    }

    #[test]
    fn test_batch_norm() {
        let ctx = cpu_context();
        let ops = GPUOps::new(ctx.clone());
        let input = GPUTensor::new(vec![1.0, 2.0], vec![1, 2], ctx.clone());
        let gamma = GPUTensor::new(vec![2.0, 2.0], vec![2], ctx.clone());
        let beta = GPUTensor::new(vec![1.0, 1.0], vec![2], ctx.clone());
        let mean = GPUTensor::new(vec![0.0, 0.0], vec![2], ctx.clone());
        let var = GPUTensor::new(vec![1.0, 1.0], vec![2], ctx);
        // With mean 0, variance 1 and eps 0 the op reduces to gamma * x + beta.
        let out = ops.batch_norm(&input, &gamma, &beta, &mean, &var, 0.0);
        assert_eq!(out.dims(), &[1, 2]);
        assert_eq!(out.data, vec![3.0, 5.0]);
    }

    #[test]
    fn test_max_pool2d() {
        let ctx = cpu_context();
        let ops = GPUOps::new(ctx.clone());
        let input = GPUTensor::new(vec![1.0, 2.0, 3.0, 4.0], vec![1, 1, 2, 2], ctx);
        let out = ops.max_pool2d(&input, (2, 2), (2, 2));
        assert_eq!(out.dims(), &[1, 1, 1, 1]);
        assert_eq!(out.data, vec![4.0]);
    }

    #[test]
    fn test_memory_manager() {
        let manager = GPUMemoryManager::new(cpu_context());
        let bytes = 100 * std::mem::size_of::<f32>();

        let buffer = manager.allocate(100);
        assert_eq!(buffer.len(), 100);
        assert_eq!(manager.allocated_memory(), bytes);
        assert_eq!(manager.peak_memory(), bytes);

        manager.deallocate(buffer);
        assert_eq!(manager.allocated_memory(), 0);
        // The peak survives deallocation until it is reset explicitly.
        assert_eq!(manager.peak_memory(), bytes);

        manager.reset_peak();
        assert_eq!(manager.peak_memory(), 0);
    }

    #[test]
    fn test_amp() {
        let mut amp = AutoMixedPrecision::new().enable();
        let loss = 1.0;
        let scaled_loss = amp.scale_loss(loss);
        assert_eq!(scaled_loss, 65536.0);

        let mut grads = vec![1.0, 2.0, 3.0];
        amp.unscale_gradients(&mut grads);
        assert_eq!(grads[0], 1.0 / 65536.0);
        assert!(grads[2] < 1.0);
    }

    #[test]
    fn test_amp_disabled_is_passthrough() {
        let mut amp = AutoMixedPrecision::default();
        assert_eq!(amp.scale_loss(2.0), 2.0);
        let mut grads = vec![1.0, 2.0];
        amp.unscale_gradients(&mut grads);
        assert_eq!(grads, vec![1.0, 2.0]);
    }

    #[test]
    fn test_amp_update_scale() {
        let mut amp = AutoMixedPrecision::new().enable();
        amp.update_scale(true);
        assert_eq!(amp.scale_loss(1.0), 32768.0);

        for _ in 0..2000 {
            amp.update_scale(false);
        }
        assert_eq!(amp.scale_loss(1.0), 65536.0);
    }
}