use crate::traits::SimdError;
#[cfg(feature = "no-std")]
use core::any::Any;
#[cfg(not(feature = "no-std"))]
use std::any::Any;
#[cfg(feature = "no-std")]
use alloc::{
boxed::Box,
collections::BTreeMap as HashMap,
string::{String, ToString},
vec,
vec::Vec,
};
#[cfg(not(feature = "no-std"))]
use std::{collections::HashMap, string::ToString};
#[cfg(feature = "no-std")]
extern crate alloc;
/// Category of hardware accelerator that a device belongs to.
///
/// The derived `PartialOrd`/`Ord` follow declaration order, so the order of
/// the variants below is part of the type's observable behavior.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum AcceleratorType {
    ASIC,
    Custom,
    DSP,
    VPU,
    NPU,
    DPU,
    AI,
    Crypto,
    Signal,
    Matrix,
}
/// Static capability description of an accelerator device.
#[derive(Debug, Clone)]
pub struct AcceleratorCapabilities {
/// Operations the device reports as supported; `AcceleratorRuntime::get_best_device`
/// and `optimization::select_accelerator` filter devices on this list.
pub supported_operations: Vec<AcceleratorOperation>,
/// Data types the device reports as supported.
pub data_types: Vec<AcceleratorDataType>,
/// Maximum batch size; `optimization` caps the first work-size dimension to it
/// for NPUs and rejects devices whose value is below the requested data size.
pub max_batch_size: usize,
/// Device memory in MiB (the scoring code multiplies it by 1024 * 1024 to get bytes).
pub memory_mb: u64,
/// Number of compute units; used as the rounding granularity for work sizes.
pub compute_units: u32,
/// Peak performance in operations (presumably per second — TODO confirm).
pub peak_performance_ops: u64,
/// Power consumption (watts, judging by the `_w` suffix — TODO confirm).
pub power_consumption_w: f64,
/// Precision modes the device reports as supported.
pub precision_modes: Vec<AcceleratorPrecision>,
}
/// Kind of computation an accelerator can be asked to perform.
///
/// Used both to describe device capabilities and to tag an `AcceleratorKernel`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AcceleratorOperation {
MatrixMultiply,
Convolution,
Activation,
Pooling,
Normalization,
Attention,
Embedding,
Reduction,
Transform,
Sort,
Search,
Compress,
Encrypt,
Hash,
/// Operation identified by a numeric code; the meaning of the code is not
/// defined anywhere in this file.
Custom(u32),
}
/// Data type an accelerator can operate on.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AcceleratorDataType {
F32,
F16,
BF16,
I32,
I16,
I8,
U32,
U16,
U8,
Bool,
/// Data type identified by a numeric code; the meaning of the code is not
/// defined anywhere in this file.
Custom(u32),
}
/// Precision mode a device can advertise in `AcceleratorCapabilities::precision_modes`.
///
/// NOTE(review): this file does not define what each level means numerically.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AcceleratorPrecision {
High,
Medium,
Low,
Mixed,
Adaptive,
}
/// Description of a single accelerator device as reported by a driver.
#[derive(Debug, Clone)]
pub struct AcceleratorDevice {
/// Identifier assigned by whoever constructs the device. Note that
/// `AcceleratorRuntime::create_context` interprets its `device_id` argument as an
/// index into the runtime's device list and does not compare it with this field.
pub id: u32,
pub name: String,
pub vendor: String,
pub model: String,
/// Device category; selects the driver in `AcceleratorRuntime::create_context`
/// and the tuning strategy in `optimization::optimize_kernel`.
pub accelerator_type: AcceleratorType,
pub capabilities: AcceleratorCapabilities,
pub driver_version: String,
pub firmware_version: String,
/// PCI identifier, if any (format not defined in this file).
pub pci_id: Option<String>,
/// NUMA node, if any.
pub numa_node: Option<u32>,
}
/// Handle to a typed memory region associated with an accelerator device.
///
/// The struct only stores a raw pointer and metadata; nothing in this file
/// allocates, dereferences or frees the memory behind `ptr`.
#[derive(Debug)]
pub struct AcceleratorBuffer<T> {
/// Raw pointer to the buffer contents.
pub ptr: *mut T,
/// Buffer size. NOTE(review): whether this counts elements of `T` or bytes is
/// not established in this file — confirm against the backend.
pub size: usize,
/// Device this buffer is associated with (stored by value, i.e. a copy of the description).
pub device: AcceleratorDevice,
/// Alignment of the allocation (presumably in bytes — TODO confirm).
pub alignment: usize,
pub memory_type: AcceleratorMemoryType,
// Opaque backend-specific state; never read in this file, hence the lint allowance.
#[allow(dead_code)] backend_handle: Option<Box<dyn Any + Send + Sync>>,
}
/// Kind of memory a buffer or memory pool lives in.
///
/// This type is the key of `AcceleratorContext::memory_pools`, so it must be
/// usable as a map key: `Hash` is required by `std::collections::HashMap` and
/// `Ord` by the `BTreeMap` that stands in for it under the `no-std` feature.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum AcceleratorMemoryType {
    Device,
    Host,
    Unified,
    Pinned,
    Cached,
    Uncached,
}
// `AcceleratorBuffer` contains a raw `*mut T`, which suppresses the automatic
// `Send`/`Sync` implementations, so they are asserted manually here.
// SAFETY: NOTE(review) — these impls are only sound if the memory behind `ptr`
// may be accessed from any thread under the usual `T: Send` / `T: Sync` rules.
// Nothing in this file enforces or documents that guarantee; confirm it with
// the backend that produces the pointer.
unsafe impl<T: Send> Send for AcceleratorBuffer<T> {}
unsafe impl<T: Sync> Sync for AcceleratorBuffer<T> {}
/// Destructor hook for `AcceleratorBuffer`.
///
/// The body is empty: dropping a buffer does not release the memory behind
/// `ptr`. NOTE(review): confirm which component is responsible for freeing it.
impl<T> Drop for AcceleratorBuffer<T> {
fn drop(&mut self) {
// Intentionally a no-op as written; fields are dropped normally afterwards.
}
}
/// Execution context created by a driver for one device.
///
/// Derives `Debug` like the other public data structures in this file
/// (`AcceleratorQueue`, `AcceleratorMemoryPool`, `AcceleratorBuffer`), so a
/// context can be logged or embedded in other `Debug` types.
#[derive(Debug)]
pub struct AcceleratorContext {
    /// Copy of the description of the device this context was created for.
    pub device: AcceleratorDevice,
    /// Command queues attached to this context (empty for the built-in drivers).
    pub command_queues: Vec<AcceleratorQueue>,
    /// Memory pools keyed by memory kind (empty for the built-in drivers).
    pub memory_pools: HashMap<AcceleratorMemoryType, AcceleratorMemoryPool>,
    // Opaque backend-specific state; never read in this file, hence the lint allowance.
    #[allow(dead_code)]
    backend_context: Option<Box<dyn Any + Send + Sync>>,
}
/// Command queue on which accelerator work can be submitted.
#[derive(Debug)]
pub struct AcceleratorQueue {
pub id: u32,
pub priority: AcceleratorPriority,
/// Identifier of the device this queue belongs to.
pub device_id: u32,
// Opaque backend-specific state; never read in this file, hence the lint allowance.
#[allow(dead_code)] backend_queue: Option<Box<dyn Any + Send + Sync>>,
}
/// Scheduling priority of an `AcceleratorQueue`.
///
/// Variants are declared in ascending order, so the derived ordering gives
/// `Low < Normal < High < Critical`; priorities can therefore be compared,
/// sorted and used as map keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum AcceleratorPriority {
    Low,
    Normal,
    High,
    Critical,
}
/// Bookkeeping record for a pool of accelerator memory of one kind.
#[derive(Debug)]
pub struct AcceleratorMemoryPool {
/// Total pool capacity (presumably in bytes — TODO confirm).
pub total_size: usize,
/// Capacity still available, in the same unit as `total_size`.
pub available_size: usize,
/// Number of allocations recorded for this pool.
pub allocation_count: usize,
pub memory_type: AcceleratorMemoryType,
// Opaque backend-specific state; never read in this file, hence the lint allowance.
#[allow(dead_code)] backend_pool: Option<Box<dyn Any + Send + Sync>>,
}
/// Description of a unit of work to run on an accelerator.
#[derive(Debug, Clone)]
pub struct AcceleratorKernel {
pub name: String,
pub operation: AcceleratorOperation,
/// Identifiers of input buffers. NOTE(review): how these map to actual
/// `AcceleratorBuffer`s is not defined in this file.
pub input_buffers: Vec<u32>,
/// Identifiers of output buffers (same caveat as `input_buffers`).
pub output_buffers: Vec<u32>,
/// Named kernel parameters.
pub parameters: HashMap<String, AcceleratorParameter>,
/// Three-dimensional work size; `optimization::optimize_kernel` adjusts only
/// the first component.
pub work_size: (usize, usize, usize),
/// Three-dimensional local size; left untouched by `optimization::optimize_kernel`.
pub local_size: (usize, usize, usize),
}
/// Value of a named kernel parameter stored in `AcceleratorKernel::parameters`.
#[derive(Debug, Clone)]
pub enum AcceleratorParameter {
Int(i64),
Float(f64),
Bool(bool),
String(String),
/// Raw byte payload.
Array(Vec<u8>),
}
/// Interface for allocating accelerator memory, transferring data and running kernels.
///
/// No implementation of this trait appears in this file; the notes below
/// describe only what the signatures establish.
pub trait AcceleratorOperations {
/// Allocates a buffer in memory of kind `memory_type`.
///
/// NOTE(review): whether `size` counts elements of `T` or bytes is not
/// established here — confirm against implementations.
fn allocate<T>(
&self,
size: usize,
memory_type: AcceleratorMemoryType,
) -> Result<AcceleratorBuffer<T>, SimdError>;
/// Copies `host_data` into `accel_buffer`, optionally on the given `queue`.
///
/// NOTE(review): the meaning of `queue == None` (default queue vs. blocking
/// copy) is not defined in this file.
fn copy_to_accelerator<T>(
&self,
host_data: &[T],
accel_buffer: &mut AcceleratorBuffer<T>,
queue: Option<&AcceleratorQueue>,
) -> Result<(), SimdError>;
/// Copies the contents of `accel_buffer` into `host_data`, optionally on the given `queue`.
fn copy_from_accelerator<T>(
&self,
accel_buffer: &AcceleratorBuffer<T>,
host_data: &mut [T],
queue: Option<&AcceleratorQueue>,
) -> Result<(), SimdError>;
/// Runs `kernel` with the given byte buffers, optionally on the given `queue`.
fn execute_kernel(
&self,
kernel: &AcceleratorKernel,
buffers: &[&AcceleratorBuffer<u8>],
queue: Option<&AcceleratorQueue>,
) -> Result<(), SimdError>;
/// Waits for outstanding work; `queue` optionally narrows the scope
/// (exact semantics are implementation-defined — TODO confirm).
fn synchronize(&self, queue: Option<&AcceleratorQueue>) -> Result<(), SimdError>;
/// Returns a snapshot of the accelerator's status.
fn get_status(&self) -> Result<AcceleratorStatus, SimdError>;
}
/// Status snapshot returned by `AcceleratorOperations::get_status`.
///
/// Units below are taken from the field-name suffixes; this file contains no
/// code that produces these values, so treat them as assumptions to confirm.
#[derive(Debug, Clone)]
pub struct AcceleratorStatus {
/// Utilization, as a percentage.
pub utilization_percent: f64,
/// Memory usage, as a percentage.
pub memory_usage_percent: f64,
/// Temperature (presumably degrees Celsius).
pub temperature_c: f64,
/// Power consumption (presumably watts).
pub power_consumption_w: f64,
/// Clock frequency (presumably MHz).
pub clock_frequency_mhz: f64,
/// Operations currently in flight.
pub active_operations: Vec<AcceleratorOperation>,
/// Number of errors recorded.
pub error_count: u32,
}
/// Registry of accelerator drivers, the devices they discovered, and the
/// contexts created so far.
pub struct AcceleratorRuntime {
// Devices collected from all available drivers during construction.
devices: Vec<AcceleratorDevice>,
// Contexts created through `create_context`, in creation order.
contexts: Vec<AcceleratorContext>,
// At most one driver per accelerator type; a later registration replaces an earlier one.
drivers: HashMap<AcceleratorType, Box<dyn AcceleratorDriver>>,
}
/// Backend driver for one category of accelerator.
///
/// Implementations must be `Send + Sync` because the runtime stores them as
/// boxed trait objects.
pub trait AcceleratorDriver: Send + Sync {
/// Prepares the driver for use. Note: nothing in this file calls this method.
fn initialize(&self) -> Result<(), SimdError>;
/// Returns the devices this driver can see.
fn discover_devices(&self) -> Result<Vec<AcceleratorDevice>, SimdError>;
/// Creates an execution context for `device`.
fn create_context(&self, device: &AcceleratorDevice) -> Result<AcceleratorContext, SimdError>;
/// Reports whether the driver is usable; the runtime skips discovery for
/// drivers that return `false`.
fn is_available(&self) -> bool;
}
impl AcceleratorRuntime {
    /// Creates a runtime with the built-in ASIC, DSP, VPU and NPU drivers
    /// registered and runs device discovery once.
    ///
    /// # Errors
    ///
    /// Propagates the first error returned by an available driver's
    /// `discover_devices`.
    pub fn new() -> Result<Self, SimdError> {
        let mut this = Self {
            devices: Vec::new(),
            contexts: Vec::new(),
            drivers: HashMap::new(),
        };

        this.register_driver(AcceleratorType::ASIC, Box::new(AsicDriver::new()));
        this.register_driver(AcceleratorType::DSP, Box::new(DspDriver::new()));
        this.register_driver(AcceleratorType::VPU, Box::new(VpuDriver::new()));
        this.register_driver(AcceleratorType::NPU, Box::new(NpuDriver::new()));

        this.discover_all_devices()?;
        Ok(this)
    }

    /// Registers `driver` for `accel_type`, replacing any driver previously
    /// registered for that type.
    ///
    /// Discovery only runs inside [`AcceleratorRuntime::new`], so a driver
    /// registered afterwards does not add devices to [`AcceleratorRuntime::devices`].
    pub fn register_driver(
        &mut self,
        accel_type: AcceleratorType,
        driver: Box<dyn AcceleratorDriver>,
    ) {
        self.drivers.insert(accel_type, driver);
    }

    /// Appends the devices of every available driver to the device list,
    /// stopping at the first discovery error.
    fn discover_all_devices(&mut self) -> Result<(), SimdError> {
        let available = self.drivers.values().filter(|driver| driver.is_available());
        for driver in available {
            let mut found = driver.discover_devices()?;
            self.devices.append(&mut found);
        }
        Ok(())
    }

    /// Returns all discovered devices.
    pub fn devices(&self) -> &[AcceleratorDevice] {
        self.devices.as_slice()
    }

    /// Returns the discovered devices whose type equals `accel_type`, in
    /// discovery order.
    pub fn devices_by_type(&self, accel_type: AcceleratorType) -> Vec<&AcceleratorDevice> {
        let mut matching = Vec::new();
        for device in &self.devices {
            if device.accelerator_type == accel_type {
                matching.push(device);
            }
        }
        matching
    }

    /// Creates a context for a device and stores it in the runtime.
    ///
    /// `device_id` is used as an index into the list returned by
    /// [`AcceleratorRuntime::devices`]; it is not matched against
    /// `AcceleratorDevice::id`.
    ///
    /// # Errors
    ///
    /// * `SimdError::InvalidArgument` if `device_id` is out of range.
    /// * `SimdError::NotImplemented` if no driver is registered for the
    ///   device's type.
    /// * Any error returned by the driver's `create_context`.
    pub fn create_context(&mut self, device_id: u32) -> Result<&AcceleratorContext, SimdError> {
        let device = match self.devices.get(device_id as usize) {
            Some(device) => device,
            None => {
                return Err(SimdError::InvalidArgument(
                    "Invalid accelerator device ID".to_string(),
                ))
            }
        };
        let driver = match self.drivers.get(&device.accelerator_type) {
            Some(driver) => driver,
            None => {
                return Err(SimdError::NotImplemented(
                    "Driver not available".to_string(),
                ))
            }
        };

        let created = driver.create_context(device)?;
        self.contexts.push(created);

        // A context was pushed on the line above, so `last()` cannot be `None`.
        let newest = self
            .contexts
            .last()
            .expect("collection should not be empty");
        Ok(newest)
    }

    /// Returns the device with the highest `peak_performance_ops` among those
    /// that support `operation`, or `None` if there is no such device.
    ///
    /// When several devices tie, the one discovered last wins.
    pub fn get_best_device(&self, operation: AcceleratorOperation) -> Option<&AcceleratorDevice> {
        self.devices
            .iter()
            .filter(|device| {
                device
                    .capabilities
                    .supported_operations
                    .contains(&operation)
            })
            .max_by_key(|device| device.capabilities.peak_performance_ops)
    }
}
/// Built-in driver registered for `AcceleratorType::ASIC` by `AcceleratorRuntime::new`.
struct AsicDriver;
/// Built-in driver registered for `AcceleratorType::DSP` by `AcceleratorRuntime::new`.
struct DspDriver;
/// Built-in driver registered for `AcceleratorType::VPU` by `AcceleratorRuntime::new`.
struct VpuDriver;
/// Built-in driver registered for `AcceleratorType::NPU` by `AcceleratorRuntime::new`.
struct NpuDriver;
impl AsicDriver {
fn new() -> Self {
Self
}
}
/// Placeholder ASIC backend: reports itself unavailable and discovers nothing.
impl AcceleratorDriver for AsicDriver {
    /// Does nothing and always succeeds.
    fn initialize(&self) -> Result<(), SimdError> {
        Ok(())
    }

    /// Always returns an empty device list.
    fn discover_devices(&self) -> Result<Vec<AcceleratorDevice>, SimdError> {
        Ok(Vec::new())
    }

    /// Returns a context holding a copy of `device`, with no queues, no
    /// memory pools and no backend state.
    fn create_context(&self, device: &AcceleratorDevice) -> Result<AcceleratorContext, SimdError> {
        let context = AcceleratorContext {
            device: device.clone(),
            command_queues: Vec::new(),
            memory_pools: HashMap::new(),
            backend_context: None,
        };
        Ok(context)
    }

    /// Always `false`, so the runtime never runs discovery on this driver.
    fn is_available(&self) -> bool {
        false
    }
}
impl DspDriver {
fn new() -> Self {
Self
}
}
/// Placeholder DSP backend: reports itself unavailable and discovers nothing.
impl AcceleratorDriver for DspDriver {
    /// Does nothing and always succeeds.
    fn initialize(&self) -> Result<(), SimdError> {
        Ok(())
    }

    /// Always returns an empty device list.
    fn discover_devices(&self) -> Result<Vec<AcceleratorDevice>, SimdError> {
        Ok(Vec::new())
    }

    /// Returns a context holding a copy of `device`, with no queues, no
    /// memory pools and no backend state.
    fn create_context(&self, device: &AcceleratorDevice) -> Result<AcceleratorContext, SimdError> {
        let context = AcceleratorContext {
            device: device.clone(),
            command_queues: Vec::new(),
            memory_pools: HashMap::new(),
            backend_context: None,
        };
        Ok(context)
    }

    /// Always `false`, so the runtime never runs discovery on this driver.
    fn is_available(&self) -> bool {
        false
    }
}
impl VpuDriver {
fn new() -> Self {
Self
}
}
/// Placeholder VPU backend: reports itself unavailable and discovers nothing.
impl AcceleratorDriver for VpuDriver {
    /// Does nothing and always succeeds.
    fn initialize(&self) -> Result<(), SimdError> {
        Ok(())
    }

    /// Always returns an empty device list.
    fn discover_devices(&self) -> Result<Vec<AcceleratorDevice>, SimdError> {
        Ok(Vec::new())
    }

    /// Returns a context holding a copy of `device`, with no queues, no
    /// memory pools and no backend state.
    fn create_context(&self, device: &AcceleratorDevice) -> Result<AcceleratorContext, SimdError> {
        let context = AcceleratorContext {
            device: device.clone(),
            command_queues: Vec::new(),
            memory_pools: HashMap::new(),
            backend_context: None,
        };
        Ok(context)
    }

    /// Always `false`, so the runtime never runs discovery on this driver.
    fn is_available(&self) -> bool {
        false
    }
}
impl NpuDriver {
fn new() -> Self {
Self
}
}
/// Placeholder NPU backend: reports itself unavailable and discovers nothing.
impl AcceleratorDriver for NpuDriver {
    /// Does nothing and always succeeds.
    fn initialize(&self) -> Result<(), SimdError> {
        Ok(())
    }

    /// Always returns an empty device list.
    fn discover_devices(&self) -> Result<Vec<AcceleratorDevice>, SimdError> {
        Ok(Vec::new())
    }

    /// Returns a context holding a copy of `device`, with no queues, no
    /// memory pools and no backend state.
    fn create_context(&self, device: &AcceleratorDevice) -> Result<AcceleratorContext, SimdError> {
        let context = AcceleratorContext {
            device: device.clone(),
            command_queues: Vec::new(),
            memory_pools: HashMap::new(),
            backend_context: None,
        };
        Ok(context)
    }

    /// Always `false`, so the runtime never runs discovery on this driver.
    fn is_available(&self) -> bool {
        false
    }
}
pub mod optimization {
use super::*;
pub fn optimize_kernel(
kernel: &AcceleratorKernel,
device: &AcceleratorDevice,
) -> Result<AcceleratorKernel, SimdError> {
let mut optimized = kernel.clone();
match device.accelerator_type {
AcceleratorType::NPU => {
optimized.work_size = optimize_for_npu(kernel.work_size, &device.capabilities);
}
AcceleratorType::VPU => {
optimized.work_size = optimize_for_vpu(kernel.work_size, &device.capabilities);
}
AcceleratorType::DSP => {
optimized.work_size = optimize_for_dsp(kernel.work_size, &device.capabilities);
}
_ => {
optimized.work_size = optimize_generic(kernel.work_size, &device.capabilities);
}
}
Ok(optimized)
}
fn optimize_for_npu(
work_size: (usize, usize, usize),
caps: &AcceleratorCapabilities,
) -> (usize, usize, usize) {
let optimal_batch = caps.max_batch_size.min(work_size.0);
(optimal_batch, work_size.1, work_size.2)
}
fn optimize_for_vpu(
work_size: (usize, usize, usize),
caps: &AcceleratorCapabilities,
) -> (usize, usize, usize) {
let compute_units = caps.compute_units as usize;
let optimal_x = work_size.0.div_ceil(compute_units) * compute_units;
(optimal_x, work_size.1, work_size.2)
}
fn optimize_for_dsp(
work_size: (usize, usize, usize),
_caps: &AcceleratorCapabilities,
) -> (usize, usize, usize) {
let next_power_of_2 = |n: usize| {
if n == 0 {
1
} else {
1 << (64 - (n - 1).leading_zeros())
}
};
(next_power_of_2(work_size.0), work_size.1, work_size.2)
}
fn optimize_generic(
work_size: (usize, usize, usize),
caps: &AcceleratorCapabilities,
) -> (usize, usize, usize) {
let compute_units = caps.compute_units as usize;
let optimal_size = work_size.0.div_ceil(compute_units) * compute_units;
(optimal_size, work_size.1, work_size.2)
}
pub fn select_accelerator(
operation: AcceleratorOperation,
data_size: usize,
devices: &[AcceleratorDevice],
) -> Option<&AcceleratorDevice> {
devices
.iter()
.filter(|d| d.capabilities.supported_operations.contains(&operation))
.filter(|d| d.capabilities.max_batch_size >= data_size)
.max_by(|a, b| {
let score_a = compute_device_score(a, operation, data_size);
let score_b = compute_device_score(b, operation, data_size);
score_a
.partial_cmp(&score_b)
.unwrap_or(core::cmp::Ordering::Equal)
})
}
fn compute_device_score(
device: &AcceleratorDevice,
operation: AcceleratorOperation,
data_size: usize,
) -> f64 {
let mut score = 0.0;
score += device.capabilities.peak_performance_ops as f64 / 1e9;
let memory_ratio =
data_size as f64 / (device.capabilities.memory_mb as f64 * 1024.0 * 1024.0);
score += if memory_ratio <= 1.0 {
1.0
} else {
1.0 / memory_ratio
};
let ops_per_watt = device.capabilities.peak_performance_ops as f64
/ device.capabilities.power_consumption_w;
score += ops_per_watt / 1e9;
match (operation, device.accelerator_type) {
(AcceleratorOperation::MatrixMultiply, AcceleratorType::NPU) => score += 2.0,
(AcceleratorOperation::Convolution, AcceleratorType::VPU) => score += 2.0,
(AcceleratorOperation::Transform, AcceleratorType::DSP) => score += 2.0,
_ => {}
}
score
}
}
/// Unit tests; only built with the standard library available.
#[cfg(all(test, not(feature = "no-std")))]
mod tests {
    use super::*;

    /// Builds a device fixture with the vendor, versions and bus information
    /// shared by the optimization and selection tests.
    fn test_device(
        id: u32,
        name: &str,
        model: &str,
        accelerator_type: AcceleratorType,
        capabilities: AcceleratorCapabilities,
    ) -> AcceleratorDevice {
        AcceleratorDevice {
            id,
            name: name.to_string(),
            vendor: "Test".to_string(),
            model: model.to_string(),
            accelerator_type,
            capabilities,
            driver_version: "1.0.0".to_string(),
            firmware_version: "1.0.0".to_string(),
            pci_id: None,
            numa_node: None,
        }
    }

    /// Builds a matrix-multiply kernel fixture with the given work size.
    fn test_kernel(work_size: (usize, usize, usize)) -> AcceleratorKernel {
        AcceleratorKernel {
            name: "test".to_string(),
            operation: AcceleratorOperation::MatrixMultiply,
            input_buffers: vec![0, 1],
            output_buffers: vec![2],
            parameters: HashMap::new(),
            work_size,
            local_size: (16, 16, 1),
        }
    }

    #[test]
    fn test_accelerator_runtime_creation() {
        let mut runtime = AcceleratorRuntime::new().expect("runtime creation should succeed");
        // Every built-in driver reports itself as unavailable, so discovery
        // finds no devices and no context can be created.
        assert!(runtime.devices().is_empty());
        assert!(runtime.devices_by_type(AcceleratorType::NPU).is_empty());
        assert!(runtime
            .get_best_device(AcceleratorOperation::MatrixMultiply)
            .is_none());
        assert!(runtime.create_context(0).is_err());
    }

    #[test]
    fn test_accelerator_type_display() {
        let expected = [
            (AcceleratorType::ASIC, "ASIC"),
            (AcceleratorType::Custom, "Custom"),
            (AcceleratorType::DSP, "DSP"),
            (AcceleratorType::VPU, "VPU"),
            (AcceleratorType::NPU, "NPU"),
        ];
        for (accel_type, name) in expected {
            assert_eq!(format!("{:?}", accel_type), name);
        }
    }

    #[test]
    fn test_accelerator_capabilities() {
        let caps = AcceleratorCapabilities {
            supported_operations: vec![
                AcceleratorOperation::MatrixMultiply,
                AcceleratorOperation::Convolution,
            ],
            data_types: vec![AcceleratorDataType::F32, AcceleratorDataType::F16],
            max_batch_size: 1024,
            memory_mb: 8192,
            compute_units: 64,
            peak_performance_ops: 1000000000,
            power_consumption_w: 150.0,
            precision_modes: vec![AcceleratorPrecision::High, AcceleratorPrecision::Mixed],
        };
        assert_eq!(caps.supported_operations.len(), 2);
        assert_eq!(caps.data_types.len(), 2);
        assert_eq!(caps.max_batch_size, 1024);
    }

    #[test]
    fn test_accelerator_kernel() {
        let mut params = HashMap::new();
        params.insert("alpha".to_string(), AcceleratorParameter::Float(1.0));
        params.insert("beta".to_string(), AcceleratorParameter::Float(0.0));
        let kernel = AcceleratorKernel {
            name: "test_kernel".to_string(),
            operation: AcceleratorOperation::MatrixMultiply,
            input_buffers: vec![0, 1],
            output_buffers: vec![2],
            parameters: params,
            work_size: (1024, 1024, 1),
            local_size: (16, 16, 1),
        };
        assert_eq!(kernel.name, "test_kernel");
        assert_eq!(kernel.operation, AcceleratorOperation::MatrixMultiply);
        assert_eq!(kernel.input_buffers.len(), 2);
        assert_eq!(kernel.output_buffers.len(), 1);
        assert_eq!(kernel.parameters.len(), 2);
    }

    #[test]
    fn test_accelerator_device() {
        let device = AcceleratorDevice {
            id: 0,
            name: "Test Accelerator".to_string(),
            vendor: "Test Vendor".to_string(),
            model: "Test Model".to_string(),
            accelerator_type: AcceleratorType::NPU,
            capabilities: AcceleratorCapabilities {
                supported_operations: vec![AcceleratorOperation::MatrixMultiply],
                data_types: vec![AcceleratorDataType::F32],
                max_batch_size: 512,
                memory_mb: 4096,
                compute_units: 32,
                peak_performance_ops: 500000000,
                power_consumption_w: 100.0,
                precision_modes: vec![AcceleratorPrecision::High],
            },
            driver_version: "1.0.0".to_string(),
            firmware_version: "2.0.0".to_string(),
            pci_id: Some("1234:5678".to_string()),
            numa_node: Some(0),
        };
        assert_eq!(device.id, 0);
        assert_eq!(device.accelerator_type, AcceleratorType::NPU);
        assert_eq!(device.capabilities.compute_units, 32);
    }

    #[test]
    fn test_kernel_optimization() {
        let device = test_device(
            0,
            "Test NPU",
            "NPU-100",
            AcceleratorType::NPU,
            AcceleratorCapabilities {
                supported_operations: vec![AcceleratorOperation::MatrixMultiply],
                data_types: vec![AcceleratorDataType::F32],
                max_batch_size: 256,
                memory_mb: 2048,
                compute_units: 16,
                peak_performance_ops: 100000000,
                power_consumption_w: 50.0,
                precision_modes: vec![AcceleratorPrecision::High],
            },
        );
        let kernel = test_kernel((512, 512, 1));
        let optimized = optimization::optimize_kernel(&kernel, &device);
        assert!(optimized.is_ok());
        let opt_kernel = optimized.expect("operation should succeed");
        // The NPU strategy caps the first dimension at `max_batch_size`.
        assert_eq!(opt_kernel.work_size.0, 256);
    }

    #[test]
    fn test_kernel_optimization_rounds_to_compute_units() {
        // VPU uses its dedicated strategy, ASIC falls through to the generic
        // one; both round the first dimension up to a multiple of 16.
        for accel_type in [AcceleratorType::VPU, AcceleratorType::ASIC] {
            let device = test_device(
                0,
                "Rounding",
                "R-1",
                accel_type,
                AcceleratorCapabilities {
                    supported_operations: vec![AcceleratorOperation::MatrixMultiply],
                    data_types: vec![AcceleratorDataType::F32],
                    max_batch_size: 256,
                    memory_mb: 2048,
                    compute_units: 16,
                    peak_performance_ops: 100000000,
                    power_consumption_w: 50.0,
                    precision_modes: vec![AcceleratorPrecision::High],
                },
            );
            let kernel = test_kernel((100, 7, 3));
            let opt_kernel = optimization::optimize_kernel(&kernel, &device)
                .expect("operation should succeed");
            assert_eq!(opt_kernel.work_size, (112, 7, 3));
            assert_eq!(opt_kernel.local_size, kernel.local_size);
            assert_eq!(opt_kernel.name, kernel.name);
        }
    }

    #[test]
    fn test_accelerator_selection() {
        let devices = vec![
            test_device(
                0,
                "NPU",
                "NPU-100",
                AcceleratorType::NPU,
                AcceleratorCapabilities {
                    supported_operations: vec![AcceleratorOperation::MatrixMultiply],
                    data_types: vec![AcceleratorDataType::F32],
                    max_batch_size: 1024,
                    memory_mb: 4096,
                    compute_units: 32,
                    peak_performance_ops: 1000000000,
                    power_consumption_w: 100.0,
                    precision_modes: vec![AcceleratorPrecision::High],
                },
            ),
            test_device(
                1,
                "VPU",
                "VPU-200",
                AcceleratorType::VPU,
                AcceleratorCapabilities {
                    supported_operations: vec![AcceleratorOperation::Convolution],
                    data_types: vec![AcceleratorDataType::F16],
                    max_batch_size: 512,
                    memory_mb: 2048,
                    compute_units: 16,
                    peak_performance_ops: 500000000,
                    power_consumption_w: 50.0,
                    precision_modes: vec![AcceleratorPrecision::Mixed],
                },
            ),
        ];
        let selected =
            optimization::select_accelerator(AcceleratorOperation::MatrixMultiply, 512, &devices);
        assert!(selected.is_some());
        assert_eq!(selected.expect("operation should succeed").id, 0);
        let selected =
            optimization::select_accelerator(AcceleratorOperation::Convolution, 256, &devices);
        assert!(selected.is_some());
        assert_eq!(selected.expect("operation should succeed").id, 1);

        // No device accepts a batch this large.
        assert!(optimization::select_accelerator(
            AcceleratorOperation::MatrixMultiply,
            4096,
            &devices
        )
        .is_none());
        // No device supports this operation.
        assert!(
            optimization::select_accelerator(AcceleratorOperation::Sort, 1, &devices).is_none()
        );
    }
}