pub mod accelerator;
pub mod custom_asic;
pub mod device_manager;
pub mod fpga;
pub mod kernel_compiler;
pub mod memory_mapping;
pub mod model_partitioning;
pub mod partial_reconfiguration;

pub use accelerator::{Accelerator, AcceleratorCapabilities, AcceleratorType};
pub use custom_asic::{ASICConfig, ASICOperation, CustomASIC, DataType, NativeOperation};
pub use device_manager::{DeviceInfo, DeviceManager, DeviceSelector};
pub use fpga::{FPGAConfig, FPGADevice, FPGAKernel};
pub use kernel_compiler::{CompilationTarget, KernelCompiler, OptimizationLevel};
pub use memory_mapping::{BufferAllocation, MemoryLayout, MemoryMapRequirements, MemoryMapper};
pub use model_partitioning::{
    LayerProfile, ModelPartition, ModelPartitioner, PartitioningStrategy,
};
pub use partial_reconfiguration::{
    DPRManager, PartialBitstream, PartialRegion, ReconfigurationState,
};

use crate::error::Result;
use scirs2_core::ndarray::prelude::*;
use std::sync::Arc;
/// Configuration for hardware-accelerated model execution.
///
/// Selects the target device, memory-management strategy, kernel
/// optimization settings, and numeric precision used by a
/// `HardwareContext`.
#[derive(Debug, Clone)]
pub struct HardwareConfig {
    /// Kind of accelerator to target (see `AcceleratorType`).
    pub device_type: AcceleratorType,
    /// Index of the device among those of `device_type`.
    pub device_id: usize,
    /// How device buffers are allocated (see `MemoryStrategy`).
    pub memory_strategy: MemoryStrategy,
    /// Kernel-compiler optimization level.
    pub optimization_level: OptimizationLevel,
    /// Whether the kernel-fusion pass is applied.
    pub enable_kernel_fusion: bool,
    /// Whether memory-layout optimization is applied.
    pub enable_layout_optimization: bool,
    /// Largest batch size the compiled kernels must support.
    pub max_batch_size: usize,
    /// Numeric precision used during execution.
    pub precision_mode: PrecisionMode,
}
impl Default for HardwareConfig {
fn default() -> Self {
Self {
device_type: AcceleratorType::CPU,
device_id: 0,
memory_strategy: MemoryStrategy::Automatic,
optimization_level: OptimizationLevel::O2,
enable_kernel_fusion: true,
enable_layout_optimization: true,
max_batch_size: 256,
precision_mode: PrecisionMode::Mixed,
}
}
/// Strategy used for allocating device memory buffers.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum MemoryStrategy {
    /// Let the runtime choose a strategy appropriate for the device.
    Automatic,
    /// Allocate all required buffers up front.
    Preallocated,
    /// Allocate buffers lazily, when first needed.
    OnDemand,
    /// Allocate from a fixed-size pool of the given size in bytes.
    PoolBased(usize),
}
/// Numeric precision used when executing a model on hardware.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PrecisionMode {
    /// Full precision (f32).
    Full,
    /// Half precision (f16).
    Half,
    /// Mixed precision (per-operation precision selection).
    Mixed,
    /// Integer-quantized execution.
    Quantized,
    /// Binary (1-bit) execution.
    Binary,
}
pub trait HardwareLayer: Send + Sync {
fn compile(&mut self, device: &dyn Accelerator, config: &HardwareConfig) -> Result<()>;
fn forward_hardware(
&self,
input: &ArrayView2<f32>,
device: &dyn Accelerator,
) -> Result<Array2<f32>>;
fn backward_hardware(
grad_output: &ArrayView2<f32>,
fn memory_requirements(&self) -> MemoryRequirements;
fn is_compiled(&self) -> bool;
/// Memory footprint of a layer, in bytes, plus its alignment requirement.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct MemoryRequirements {
    /// Bytes needed for the layer's input buffer.
    pub input_size: usize,
    /// Bytes needed for the layer's output buffer.
    pub output_size: usize,
    /// Bytes needed for the layer's weights.
    pub weight_size: usize,
    /// Bytes of scratch workspace needed during execution.
    pub workspace_size: usize,
    /// Required buffer alignment in bytes.
    pub alignment: usize,
}
pub struct HardwareContext {
device_manager: DeviceManager,
active_device: Arc<dyn Accelerator>,
memory_mapper: MemoryMapper,
kernel_compiler: KernelCompiler,
config: HardwareConfig,
impl HardwareContext {
pub fn new(config: HardwareConfig) -> Result<Self> {
let device_manager = DeviceManager::new()?;
let active_device = device_manager.get_device(_config.device_type, config.device_id)?;
let memory_mapper = MemoryMapper::new(active_device.clone(), config.memory_strategy)?;
let kernel_compiler = KernelCompiler::new(_config.optimization_level);
Ok(Self {
device_manager,
active_device,
memory_mapper,
kernel_compiler,
config,
})
pub fn list_devices(&self) -> Vec<DeviceInfo> {
self.device_manager.list_devices()
pub fn switch_device(&mut self, device_type: AcceleratorType, deviceid: usize) -> Result<()> {
self.active_device = self.device_manager.get_device(device_type, device_id)?;
self.memory_mapper =
MemoryMapper::new(self.active_device.clone(), self.config.memory_strategy)?;
self.config.device_type = device_type;
self.config.device_id = device_id;
Ok(())
pub fn compile_model(&mut self, model: &mut dyn HardwareModel) -> Result<()> {
model.compile(&*self.active_device, &self.config)?;
pub fn execute_model(
model: &dyn HardwareModel,
) -> Result<Array2<f32>> {
if !model.is_compiled() {
return Err(crate::error::NeuralError::InvalidArgument(
"Model must be compiled before execution".to_string(),
));
model.forward_hardware(input, &*self.active_device)
pub fn memory_stats(&self) -> MemoryStatistics {
self.memory_mapper.get_statistics()
pub fn optimize_memory_layout(&mut self, model: &dyn HardwareModel) -> Result<()> {
if !self.config.enable_layout_optimization {
return Ok(());
let requirements = model.memory_requirements();
self.memory_mapper.optimize_layout(&requirements)?;
pub trait HardwareModel: Send + Sync {
fn memory_requirements(&self) -> Vec<MemoryRequirements>;
fn statistics(&self) -> ModelStatistics;
/// Aggregate compute and memory statistics for a model.
#[derive(Debug, Clone, PartialEq)]
pub struct ModelStatistics {
    /// Total number of trainable parameters.
    pub total_params: usize,
    /// Total floating-point operations per forward pass.
    pub total_flops: usize,
    /// Memory bandwidth requirement in bytes.
    pub memory_bandwidth: usize,
    /// Ratio of compute to memory traffic (FLOPs per byte).
    pub compute_intensity: f32,
    /// Estimated forward-pass latency.
    pub estimated_latency: f32,
}
/// Snapshot of device-memory usage, as reported by the memory mapper.
#[derive(Debug, Clone, PartialEq)]
pub struct MemoryStatistics {
    /// Bytes currently allocated.
    pub allocated: usize,
    /// Bytes currently in use.
    pub used: usize,
    /// Peak bytes in use since the mapper was created.
    pub peak: usize,
    /// Number of live allocations.
    pub num_allocations: usize,
    /// Fragmentation ratio (0.0 = none).
    pub fragmentation: f32,
}
/// Greedy kernel-fusion pass that merges runs of adjacent element-wise
/// kernels into single fused kernels.
#[derive(Debug, Clone)]
pub struct KernelFusion {
    // When false, `optimize_kernels` only wraps kernels without fusing.
    enabled: bool,
    // Minimum number of kernels before fusion is attempted.
    fusion_threshold: usize,
    // Maximum number of kernels merged into one fusion group.
    max_fusion_depth: usize,
}
impl KernelFusion {
pub fn new(enabled: bool) -> Self {
enabled,
fusion_threshold: 2,
max_fusion_depth: 5,
pub fn optimize_kernels(&self, kernels: Vec<KernelDescriptor>) -> Result<Vec<FusedKernel>> {
if !self._enabled || kernels.len() < self.fusion_threshold {
return Ok(kernels
.into_iter()
.map(|k| FusedKernel {
kernels: vec![k],
fusion_type: FusionType::None,
})
.collect());
let mut fused = Vec::new();
let mut i = 0;
while i < kernels.len() {
if i + 1 < kernels.len() && self.can_fuse(&kernels[i], &kernels[i + 1]) {
let mut fusion_group = vec![kernels[i].clone(), kernels[i + 1].clone()];
i += 2;
while i < kernels.len()
&& fusion_group.len() < self.max_fusion_depth
&& self.can_fuse(fusion_group.last().expect("Operation failed"), &kernels[i])
{
fusion_group.push(kernels[i].clone());
i += 1;
}
fused.push(FusedKernel {
kernels: fusion_group,
fusion_type: FusionType::ElementWise,
});
} else {
kernels: vec![kernels[i].clone()],
i += 1;
}
Ok(fused)
fn can_fuse(&self, kernel1: &KernelDescriptor, kernel2: &KernelDescriptor) -> bool {
kernel1.operation_type.is_element_wise()
&& kernel2.operation_type.is_element_wise()
&& kernel1.outputshape == kernel2.inputshape
pub struct KernelDescriptor {
pub name: String,
pub operation_type: OperationType,
pub inputshape: Vec<usize>,
pub outputshape: Vec<usize>,
pub memory_access_pattern: MemoryAccessPattern,
#[derive(Debug, Clone, PartialEq)]
pub enum OperationType {
MatMul,
Conv2D,
ElementWise(ElementWiseOp),
Reduction(ReductionOp),
Reshape,
Transpose,
impl OperationType {
fn is_element_wise(&self) -> bool {
matches!(self, OperationType::ElementWise(_))
/// Element-wise operations eligible for kernel fusion.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ElementWiseOp {
    Add,
    Multiply,
    ReLU,
    Sigmoid,
    Tanh,
}
/// Reduction operations over tensor elements.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ReductionOp {
    Sum,
    Mean,
    Max,
    Min,
}
/// How a kernel accesses memory.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MemoryAccessPattern {
    /// Contiguous sequential access.
    Sequential,
    /// Fixed-stride access (stride in elements).
    Strided(usize),
    /// Unpredictable access order.
    Random,
    /// Tiled access with the given tile dimensions.
    Tiled(usize, usize),
}
pub struct FusedKernel {
pub kernels: Vec<KernelDescriptor>,
pub fusion_type: FusionType,
/// The kind of fusion applied to a `FusedKernel`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FusionType {
    /// No fusion — a single stand-alone kernel.
    None,
    /// A chain of element-wise kernels merged into one.
    ElementWise,
    /// Convolution + bias + ReLU pattern.
    ConvBiasReLU,
    /// Matrix multiply + bias + activation pattern.
    MatMulBiasActivation,
    /// A named custom fusion pattern.
    Custom(String),
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_hardware_config_default() {
        let config = HardwareConfig::default();
        assert_eq!(config.device_type, AcceleratorType::CPU);
        assert_eq!(config.optimization_level, OptimizationLevel::O2);
        assert!(config.enable_kernel_fusion);
    }

    #[test]
    fn test_kernel_fusion() {
        let fusion = KernelFusion::new(true);
        // Two adjacent element-wise kernels with matching shapes should
        // fuse into a single ElementWise group.
        // NOTE(review): the second descriptor's fields were truncated in the
        // source; shapes/pattern are reconstructed to match the first so the
        // pair is fusible — confirm against the original test.
        let kernels = vec![
            KernelDescriptor {
                name: "add".to_string(),
                operation_type: OperationType::ElementWise(ElementWiseOp::Add),
                inputshape: vec![32, 64],
                outputshape: vec![32, 64],
                memory_access_pattern: MemoryAccessPattern::Sequential,
            },
            KernelDescriptor {
                name: "relu".to_string(),
                operation_type: OperationType::ElementWise(ElementWiseOp::ReLU),
                inputshape: vec![32, 64],
                outputshape: vec![32, 64],
                memory_access_pattern: MemoryAccessPattern::Sequential,
            },
        ];
        let fused = fusion.optimize_kernels(kernels).expect("Operation failed");
        assert_eq!(fused.len(), 1);
        assert_eq!(fused[0].kernels.len(), 2);
        assert_eq!(fused[0].fusion_type, FusionType::ElementWise);
    }
}