use crate::error::{RusTorchError, RusTorchResult};
use std::collections::HashMap;
/// Backend family of a compute device described by an [`AcceleratorInfo`].
///
/// The enum is fieldless, so besides `PartialEq` it also derives `Eq` and
/// `Hash`; this makes it usable as a `HashMap` / `HashSet` key.
///
/// Variant spellings are part of the public API and are kept as they are.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum AcceleratorType {
    /// The host CPU.
    CPU,
    /// A device driven through CUDA.
    CUDA,
    /// A device driven through ROCm.
    ROCm,
    /// A device driven through oneAPI.
    OneAPI,
    /// A device driven through Metal.
    Metal,
    /// Any backend not covered by the other variants.
    Custom,
}
/// Description of the machine, collected once when a `HardwareOptimizer`
/// is constructed and exposed read-only through
/// `HardwareOptimizer::capabilities`.
#[derive(Debug, Clone)]
pub struct HardwareCapabilities {
    /// Every accelerator reported by detection; may be empty.
    pub accelerators: Vec<AcceleratorInfo>,

    /// Description of the host CPU.
    pub cpu_info: CpuInfo,

    /// Cache / main-memory sizes together with their access latencies.
    pub memory_hierarchy: MemoryHierarchy,

    /// Interconnect bandwidth.
    ///
    /// NOTE(review): the unit is not established anywhere in this file —
    /// confirm before relying on it.
    pub interconnect_bandwidth: f64,

    /// Power budget, or `None` when no constraint is known.
    ///
    /// NOTE(review): unit not established in this file — confirm.
    pub power_budget: Option<f64>,
}
/// Static description of a single detected accelerator device.
#[derive(Debug, Clone)]
pub struct AcceleratorInfo {
    /// Backend family the device belongs to.
    pub accel_type: AcceleratorType,

    /// Human-readable device name, e.g. `"Apple Silicon GPU"`.
    pub name: String,

    /// Number of compute units on the device.
    pub compute_units: usize,

    /// Device clock frequency (presumably MHz, like the CPU frequency
    /// fields — TODO confirm).
    pub clock_freq: usize,

    /// Device memory size (presumably bytes: the Metal probe stores
    /// `8 * 1024 * 1024 * 1024` here — TODO confirm).
    pub memory_size: usize,

    /// Device memory bandwidth (unit not established in this file —
    /// TODO confirm).
    pub memory_bandwidth: f64,

    /// Ranking score; the optimizer selects the device with the highest
    /// value.
    pub compute_capability: f64,

    /// Index identifying the device.
    pub device_id: usize,
}
/// Description of the host CPU.
#[derive(Debug, Clone)]
pub struct CpuInfo {
    /// Vendor label.
    pub vendor: String,

    /// Model label.
    pub model: String,

    /// Number of physical cores.
    pub physical_cores: usize,

    /// Number of logical cores.
    pub logical_cores: usize,

    /// Base clock frequency (this file's tests print it as MHz).
    pub base_freq: usize,

    /// Turbo clock frequency when known, in the same unit as `base_freq`
    /// (presumably — TODO confirm); `None` otherwise.
    pub turbo_freq: Option<usize>,

    /// Names of the instruction-set extensions that were detected,
    /// e.g. `"AVX2"` or `"NEON"`.
    pub extensions: Vec<String>,
}
/// Sizes and access latencies of the cache levels and main memory.
///
/// The cache sizes are printed by this file's tests after dividing by 1024
/// ("KB") or 1024 * 1024 ("MB"), i.e. they are handled as byte counts.
#[derive(Debug, Clone)]
pub struct MemoryHierarchy {
    /// L1 cache size.
    pub l1_cache: usize,

    /// L2 cache size.
    pub l2_cache: usize,

    /// L3 cache size, or `None` when there is no L3 cache.
    pub l3_cache: Option<usize>,

    /// Main-memory size.
    pub main_memory: usize,

    /// Access latency per level; detection in this file fills the keys
    /// `"L1"`, `"L2"`, `"L3"` and `"Main"`.
    ///
    /// NOTE(review): the latency unit (cycles? nanoseconds?) is not
    /// established in this file — confirm.
    pub latencies: HashMap<String, usize>,
}
/// Chooses data layouts and tile sizes based on the detected hardware.
///
/// `Debug` is derived so that this public type can be logged and embedded in
/// other `Debug` types; every field type already implements it.
#[derive(Debug)]
pub struct HardwareOptimizer {
    /// Hardware description collected at construction time.
    capabilities: HardwareCapabilities,
    /// Accelerator picked from `capabilities`, or `None` when none was
    /// detected.
    selected_accelerator: Option<AcceleratorInfo>,
    /// Active optimization goal.
    optimization_strategy: OptimizationStrategy,
}
/// Goal the optimizer tunes for.
///
/// `HardwareOptimizer::new` always starts with [`OptimizationStrategy::Balanced`].
#[derive(Debug, Clone, Copy)]
enum OptimizationStrategy {
    /// Favor low latency.
    MinimizeLatency,
    /// Favor high throughput.
    MaximizeThroughput,
    /// Favor low power consumption.
    PowerEfficient,
    /// Compromise between the other goals.
    Balanced,
}
impl HardwareOptimizer {
    /// Creates an optimizer by probing the current machine.
    ///
    /// Detection runs once, here. The accelerator with the highest
    /// `compute_capability` (if any was found) is selected immediately and
    /// the strategy starts out as `OptimizationStrategy::Balanced`.
    pub fn new() -> Self {
        let capabilities = Self::detect_hardware();
        let selected_accelerator = Self::select_best_accelerator(&capabilities);
        Self {
            capabilities,
            selected_accelerator,
            optimization_strategy: OptimizationStrategy::Balanced,
        }
    }

    /// Runs every probe and assembles the results.
    fn detect_hardware() -> HardwareCapabilities {
        let cpu_info = Self::detect_cpu_info();
        let accelerators = Self::detect_accelerators();
        let memory_hierarchy = Self::detect_memory_hierarchy();
        let interconnect_bandwidth = Self::measure_interconnect_bandwidth();
        let power_budget = Self::detect_power_constraints();
        HardwareCapabilities {
            accelerators,
            cpu_info,
            memory_hierarchy,
            interconnect_bandwidth,
            power_budget,
        }
    }

    /// Collects the CPU description; core counts come from the `num_cpus`
    /// crate, everything else from the helpers below.
    fn detect_cpu_info() -> CpuInfo {
        let logical_cores = num_cpus::get();
        let physical_cores = num_cpus::get_physical();
        CpuInfo {
            vendor: Self::get_cpu_vendor(),
            model: Self::get_cpu_model(),
            physical_cores,
            logical_cores,
            base_freq: Self::get_cpu_frequency(),
            turbo_freq: Self::get_turbo_frequency(),
            extensions: Self::detect_cpu_extensions(),
        }
    }

    /// Returns a label derived from the compile-time target architecture.
    ///
    /// Despite the name, no vendor string is queried from the CPU.
    fn get_cpu_vendor() -> String {
        let label = if cfg!(target_arch = "x86_64") {
            "x86_64"
        } else if cfg!(target_arch = "aarch64") {
            "ARM"
        } else {
            "Unknown"
        };
        label.to_string()
    }

    /// Returns a generic model label derived from the compile-time target
    /// architecture.
    fn get_cpu_model() -> String {
        let label = if cfg!(target_arch = "x86_64") {
            "x86_64 CPU"
        } else if cfg!(target_arch = "aarch64") {
            "ARM CPU"
        } else {
            "Unknown CPU"
        };
        label.to_string()
    }

    /// Returns a fixed placeholder value; the frequency is not measured.
    fn get_cpu_frequency() -> usize {
        2000
    }

    /// Turbo frequency is not detected; always `None`.
    fn get_turbo_frequency() -> Option<usize> {
        None
    }

    /// Lists the SIMD-related instruction-set extensions that are available.
    ///
    /// On x86_64 the features are checked at run time; on aarch64 `"NEON"` is
    /// reported unconditionally; on every other architecture the list is
    /// empty.
    fn detect_cpu_extensions() -> Vec<String> {
        #[cfg(target_arch = "x86_64")]
        let detected: Vec<&'static str> = {
            // `is_x86_feature_detected!` only accepts string literals, so
            // every probe is spelled out. The order is the reporting order.
            let probes = [
                ("SSE", is_x86_feature_detected!("sse")),
                ("SSE2", is_x86_feature_detected!("sse2")),
                ("SSE3", is_x86_feature_detected!("sse3")),
                ("SSSE3", is_x86_feature_detected!("ssse3")),
                ("SSE4.1", is_x86_feature_detected!("sse4.1")),
                ("SSE4.2", is_x86_feature_detected!("sse4.2")),
                ("AVX", is_x86_feature_detected!("avx")),
                ("AVX2", is_x86_feature_detected!("avx2")),
                ("AVX512F", is_x86_feature_detected!("avx512f")),
                ("FMA", is_x86_feature_detected!("fma")),
            ];
            probes
                .iter()
                .filter(|(_, present)| *present)
                .map(|(name, _)| *name)
                .collect()
        };
        #[cfg(target_arch = "aarch64")]
        let detected = ["NEON"];
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        let detected: [&'static str; 0] = [];

        detected.iter().map(|name| String::from(*name)).collect()
    }

    /// Gathers the devices reported by every backend probe.
    ///
    /// Each probe has an implementation for every build configuration (a
    /// stub returning an empty list where the backend is unavailable), so
    /// both are called unconditionally. Probing is best-effort: a probe that
    /// fails simply contributes no devices.
    fn detect_accelerators() -> Vec<AcceleratorInfo> {
        let mut accelerators = Self::detect_cuda_devices().unwrap_or_default();
        accelerators.extend(Self::detect_metal_devices().unwrap_or_default());
        accelerators
    }

    /// CUDA probe for builds with the `cuda` feature.
    ///
    /// Currently a stub: it reports no devices.
    #[cfg(feature = "cuda")]
    fn detect_cuda_devices() -> RusTorchResult<Vec<AcceleratorInfo>> {
        Ok(Vec::new())
    }

    /// CUDA probe for builds without the `cuda` feature; reports no devices.
    #[cfg(not(feature = "cuda"))]
    fn detect_cuda_devices() -> RusTorchResult<Vec<AcceleratorInfo>> {
        Ok(Vec::new())
    }

    /// Metal probe for macOS.
    ///
    /// Returns one hard-coded device description; nothing is queried from
    /// the system.
    #[cfg(target_os = "macos")]
    fn detect_metal_devices() -> RusTorchResult<Vec<AcceleratorInfo>> {
        Ok(vec![AcceleratorInfo {
            accel_type: AcceleratorType::Metal,
            name: "Apple Silicon GPU".to_string(),
            compute_units: 8,
            clock_freq: 1300,
            memory_size: 8 * 1024 * 1024 * 1024,
            memory_bandwidth: 200.0,
            compute_capability: 2.6,
            device_id: 0,
        }])
    }

    /// Metal probe for non-macOS targets; reports no devices.
    #[cfg(not(target_os = "macos"))]
    fn detect_metal_devices() -> RusTorchResult<Vec<AcceleratorInfo>> {
        Ok(Vec::new())
    }

    /// Builds the memory description from fixed values; only the
    /// main-memory size is delegated to `get_system_memory`.
    fn detect_memory_hierarchy() -> MemoryHierarchy {
        let latencies: HashMap<String, usize> =
            [("L1", 4), ("L2", 12), ("L3", 40), ("Main", 100)]
                .iter()
                .map(|&(level, latency)| (level.to_string(), latency))
                .collect();
        MemoryHierarchy {
            l1_cache: 32 * 1024,
            l2_cache: 256 * 1024,
            l3_cache: Some(8 * 1024 * 1024),
            main_memory: Self::get_system_memory(),
            latencies,
        }
    }

    /// Returns a fixed 8 GiB figure on every platform; memory is not
    /// queried.
    ///
    /// Saturating multiplication clamps the result to `usize::MAX` on
    /// targets whose `usize` cannot hold 8 GiB instead of overflowing.
    fn get_system_memory() -> usize {
        8_usize
            .saturating_mul(1024)
            .saturating_mul(1024)
            .saturating_mul(1024)
    }

    /// Returns a fixed placeholder value; nothing is measured.
    fn measure_interconnect_bandwidth() -> f64 {
        100.0
    }

    /// Power constraints are not detected; always `None`.
    fn detect_power_constraints() -> Option<f64> {
        None
    }

    /// Picks the accelerator with the highest `compute_capability`.
    ///
    /// Returns `None` when no accelerator was detected. When several devices
    /// share the highest score, the last one in the list wins (that is how
    /// `Iterator::max_by` breaks ties). A NaN score ranks below every real
    /// score, so a malformed entry can never displace a valid device.
    fn select_best_accelerator(capabilities: &HardwareCapabilities) -> Option<AcceleratorInfo> {
        capabilities
            .accelerators
            .iter()
            .max_by(|a, b| Self::compare_capability(a.compute_capability, b.compute_capability))
            .cloned()
    }

    /// Total order on capability scores: NaN sorts lowest, everything else
    /// compares numerically.
    fn compare_capability(lhs: f64, rhs: f64) -> std::cmp::Ordering {
        use std::cmp::Ordering;

        match (lhs.is_nan(), rhs.is_nan()) {
            (true, true) => Ordering::Equal,
            (true, false) => Ordering::Less,
            (false, true) => Ordering::Greater,
            // Neither side is NaN here, so `partial_cmp` always yields
            // `Some`; the fallback is unreachable in practice.
            (false, false) => lhs.partial_cmp(&rhs).unwrap_or(Ordering::Equal),
        }
    }

    /// Suggests a memory layout for a tensor of the given shape.
    ///
    /// * With a selected accelerator the shape is ignored: Metal devices get
    ///   [`DataLayout::Tiled`], every other backend [`DataLayout::RowMajor`].
    /// * Without one, a rank-2 shape whose second dimension is a multiple of
    ///   8 gets [`DataLayout::RowMajor`]; any other shape gets
    ///   [`DataLayout::ColumnMajor`].
    pub fn optimal_data_layout(&self, tensor_shape: &[usize]) -> DataLayout {
        match &self.selected_accelerator {
            // Listed exhaustively so that adding a backend forces a decision
            // here instead of silently falling into a catch-all arm.
            Some(accel) => match accel.accel_type {
                AcceleratorType::Metal => DataLayout::Tiled,
                AcceleratorType::CUDA
                | AcceleratorType::ROCm
                | AcceleratorType::CPU
                | AcceleratorType::OneAPI
                | AcceleratorType::Custom => DataLayout::RowMajor,
            },
            None => match tensor_shape {
                [_, cols] if cols % 8 == 0 => DataLayout::RowMajor,
                _ => DataLayout::ColumnMajor,
            },
        }
    }

    /// Suggests a `(rows, cols)` tile size for the named operation.
    ///
    /// * `"matmul"`: the largest square tile such that three `f32` tiles fit
    ///   into the L1 cache size, but never smaller than 1 x 1.
    /// * `"conv2d"`: always `(32, 32)`.
    /// * anything else: `(64, 64)`.
    pub fn optimal_tile_size(&self, operation: &str) -> (usize, usize) {
        match operation {
            "matmul" => {
                let l1_bytes = self.capabilities.memory_hierarchy.l1_cache;
                // Three tiles share the cache (presumably the two operand
                // tiles plus the result tile).
                let bytes_per_element_triple = 3 * std::mem::size_of::<f32>();
                let tile_elements = l1_bytes / bytes_per_element_triple;
                // Truncating cast == floor(sqrt). Clamp to 1 so that a very
                // small cache size can never yield a zero-sized tile.
                let tile_dim = ((tile_elements as f64).sqrt() as usize).max(1);
                (tile_dim, tile_dim)
            }
            "conv2d" => (32, 32),
            _ => (64, 64),
        }
    }

    /// Returns the hardware description collected at construction time.
    pub fn capabilities(&self) -> &HardwareCapabilities {
        &self.capabilities
    }

    /// Returns the accelerator chosen at construction time, if any.
    pub fn selected_accelerator(&self) -> Option<&AcceleratorInfo> {
        self.selected_accelerator.as_ref()
    }
}

/// `Default` mirrors [`HardwareOptimizer::new`], so the type works with
/// `..Default::default()`, `#[derive(Default)]` on containing types, and
/// similar generic code.
impl Default for HardwareOptimizer {
    fn default() -> Self {
        Self::new()
    }
}
/// Memory layout suggested for a tensor.
///
/// Derives `PartialEq`, `Eq` and `Hash` so that callers (and tests) can
/// compare a returned layout with `==` / `assert_eq!` and use it as a map
/// key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DataLayout {
    /// Row-major layout.
    RowMajor,
    /// Column-major layout.
    ColumnMajor,
    /// Tiled layout.
    Tiled,
    /// A layout not covered by the other variants.
    Custom,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Detection must report at least one logical core, and the selected
    /// accelerator must come from the detected list.
    #[test]
    fn test_hardware_detection() {
        let optimizer = HardwareOptimizer::new();
        let capabilities = optimizer.capabilities();

        println!("CPU Information:");
        println!(" Vendor: {}", capabilities.cpu_info.vendor);
        println!(" Model: {}", capabilities.cpu_info.model);
        println!(" Physical cores: {}", capabilities.cpu_info.physical_cores);
        println!(" Logical cores: {}", capabilities.cpu_info.logical_cores);
        println!(" Base frequency: {} MHz", capabilities.cpu_info.base_freq);
        println!(" Extensions: {:?}", capabilities.cpu_info.extensions);

        println!("\nMemory Hierarchy:");
        println!(
            " L1 cache: {} KB",
            capabilities.memory_hierarchy.l1_cache / 1024
        );
        println!(
            " L2 cache: {} KB",
            capabilities.memory_hierarchy.l2_cache / 1024
        );
        if let Some(l3) = capabilities.memory_hierarchy.l3_cache {
            println!(" L3 cache: {} MB", l3 / (1024 * 1024));
        }

        println!(
            "\nAccelerators: {} detected",
            capabilities.accelerators.len()
        );
        for accel in &capabilities.accelerators {
            println!(" - {} ({})", accel.name, accel.compute_capability);
        }

        assert!(capabilities.cpu_info.logical_cores > 0);

        // An accelerator is selected exactly when at least one was detected,
        // and the selection is one of the detected devices.
        match optimizer.selected_accelerator() {
            Some(selected) => assert!(capabilities.accelerators.iter().any(|accel| {
                accel.accel_type == selected.accel_type && accel.device_id == selected.device_id
            })),
            None => assert!(capabilities.accelerators.is_empty()),
        }
    }

    /// The suggested layout must follow the rules of `optimal_data_layout`
    /// for whichever accelerator (if any) this machine selected.
    #[test]
    fn test_optimal_layouts() {
        let optimizer = HardwareOptimizer::new();
        let layout1 = optimizer.optimal_data_layout(&[1024, 1024]);
        let layout2 = optimizer.optimal_data_layout(&[100, 7]);
        println!("Optimal layout for [1024, 1024]: {:?}", layout1);
        println!("Optimal layout for [100, 7]: {:?}", layout2);

        // `matches!` keeps the checks independent of `DataLayout`
        // implementing `PartialEq`.
        match optimizer.selected_accelerator().map(|accel| accel.accel_type) {
            // CPU path: rank-2 shapes with a second dimension divisible by 8
            // are row-major, everything else is column-major.
            None => {
                assert!(matches!(layout1, DataLayout::RowMajor));
                assert!(matches!(layout2, DataLayout::ColumnMajor));
            }
            Some(AcceleratorType::Metal) => {
                assert!(matches!(layout1, DataLayout::Tiled));
                assert!(matches!(layout2, DataLayout::Tiled));
            }
            Some(_) => {
                assert!(matches!(layout1, DataLayout::RowMajor));
                assert!(matches!(layout2, DataLayout::RowMajor));
            }
        }
    }

    /// Tile sizes must be non-zero, square for matmul, fit the L1 budget,
    /// and use the fixed values for the other operations.
    #[test]
    fn test_tile_size_calculation() {
        let optimizer = HardwareOptimizer::new();
        let (m, n) = optimizer.optimal_tile_size("matmul");
        println!("Optimal tile size for matmul: {}x{}", m, n);
        assert!(m > 0 && n > 0);
        assert_eq!(m, n);

        // Three f32 tiles of the suggested size must fit into L1.
        let l1_bytes = optimizer.capabilities().memory_hierarchy.l1_cache;
        assert!(3 * m * n * std::mem::size_of::<f32>() <= l1_bytes);

        assert_eq!(optimizer.optimal_tile_size("conv2d"), (32, 32));
        assert_eq!(optimizer.optimal_tile_size("some-other-op"), (64, 64));
    }
}