use crate::{Module, ModuleBase, Parameter};
use std::collections::HashMap;
use torsh_core::error::Result;
#[cfg(not(feature = "cuda"))]
use torsh_core::error::TorshError;
use torsh_tensor::{creation::*, Tensor};
/// Plain-data snapshot of the CPU and accelerator features available to this build.
///
/// Values are normally produced by `HardwareCapabilities::detect()`, but every field is
/// public, so callers may also construct or tweak a snapshot by hand.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct HardwareCapabilities {
/// AVX2 is usable; `detect()` probes this on x86_64 and reports `false` on other targets.
pub has_avx2: bool,
/// AVX-512F is usable; `detect()` probes this on x86_64 and reports `false` on other targets.
pub has_avx512: bool,
/// NEON is usable; `detect()` reports `true` on every aarch64 target and `false` elsewhere.
pub has_neon: bool,
/// The crate was compiled with the `cuda` feature (`detect()` does not query a device).
pub has_cuda: bool,
/// The crate was compiled with the `rocm` feature (`detect()` does not query a device).
pub has_rocm: bool,
/// The crate was compiled with the `metal` feature for an Apple target.
pub has_metal: bool,
/// Tensor-core availability; `detect()` simply copies `has_cuda` into this flag.
pub has_tensor_cores: bool,
/// CPU core count as reported by `num_cpus::get()` in `detect()`.
pub num_cores: usize,
/// L1 cache size in bytes; `detect()` fills in a fixed per-architecture constant.
pub l1_cache_size: usize,
/// L2 cache size in bytes; `detect()` fills in a fixed per-architecture constant.
pub l2_cache_size: usize,
/// L3 cache size in bytes; `detect()` fills in a fixed per-architecture constant.
pub l3_cache_size: usize,
}
impl HardwareCapabilities {
    /// Builds a capability snapshot for the current build target and host CPU.
    ///
    /// * AVX2 / AVX-512F are probed on x86_64 and reported as `false` on other targets.
    /// * NEON is reported as `true` on every aarch64 build and `false` elsewhere.
    /// * CUDA / ROCm / Metal only reflect which Cargo features were compiled in (Metal
    ///   additionally requires an Apple target); no device is queried at runtime.
    /// * `has_tensor_cores` mirrors `has_cuda`.
    /// * Cache sizes are fixed per-architecture constants, not measured values.
    pub fn detect() -> Self {
        // `is_x86_feature_detected!` already evaluates to `true` without a runtime probe
        // when the feature is enabled for the whole crate at compile time, so a separate
        // `cfg(target_feature = ...)` branch is unnecessary.
        #[cfg(target_arch = "x86_64")]
        let (has_avx2, has_avx512) = (
            is_x86_feature_detected!("avx2"),
            is_x86_feature_detected!("avx512f"),
        );
        #[cfg(not(target_arch = "x86_64"))]
        let (has_avx2, has_avx512) = (false, false);

        let has_neon = cfg!(target_arch = "aarch64");
        let has_cuda = cfg!(feature = "cuda");
        let has_rocm = cfg!(feature = "rocm");
        let has_metal = cfg!(all(target_vendor = "apple", feature = "metal"));
        // No device query is performed: a CUDA build is assumed to have tensor cores.
        let has_tensor_cores = has_cuda;

        // Fixed (L1, L2, L3) sizes in bytes, chosen per target architecture.
        let (l1_cache_size, l2_cache_size, l3_cache_size) = if cfg!(target_arch = "x86_64") {
            (32 * 1024, 256 * 1024, 8 * 1024 * 1024)
        } else if cfg!(target_arch = "aarch64") {
            (64 * 1024, 512 * 1024, 4 * 1024 * 1024)
        } else {
            (32 * 1024, 256 * 1024, 2 * 1024 * 1024)
        };

        Self {
            has_avx2,
            has_avx512,
            has_neon,
            has_cuda,
            has_rocm,
            has_metal,
            has_tensor_cores,
            num_cores: num_cpus::get(),
            l1_cache_size,
            l2_cache_size,
            l3_cache_size,
        }
    }

    /// Returns the SIMD lane count for the widest enabled instruction set:
    /// 16 for AVX-512, 8 for AVX2, 4 for NEON, and 1 when none is flagged.
    pub fn simd_width(&self) -> usize {
        if self.has_avx512 {
            16
        } else if self.has_avx2 {
            8
        } else if self.has_neon {
            4
        } else {
            1
        }
    }

    /// Suggests a square tile edge length (in elements) for blocked matrix kernels.
    ///
    /// The edge is the integer square root of the number of 4-byte elements that fit in
    /// one third of L1 (presumably one third each for two input tiles and one output
    /// tile of `f32` — TODO confirm), rounded down to a multiple of [`Self::simd_width`].
    ///
    /// The result is never smaller than one SIMD width. Because the fields are public,
    /// a hand-built snapshot with a tiny (or zero) `l1_cache_size` would otherwise round
    /// down to a tile size of 0.
    pub fn matrix_tile_size(&self) -> usize {
        let simd = self.simd_width();
        let target_bytes = self.l1_cache_size / 3;
        let tile = (target_bytes / 4).isqrt();
        ((tile / simd) * simd).max(simd)
    }
}
impl Default for HardwareCapabilities {
fn default() -> Self {
Self::detect()
}
}
/// Execution policy: a capability snapshot plus the caller's device and tiling preferences.
#[derive(Debug, Clone)]
pub struct HardwareContext {
// Capability snapshot consulted by `use_gpu`, `simd_width` and `tile_size`.
capabilities: HardwareCapabilities,
// Must be `true` for `use_gpu` to ever return `true`.
prefer_gpu: bool,
// When `true`, `use_gpu` returns `false` regardless of the other settings.
force_cpu: bool,
// When `Some`, `tile_size` returns this value instead of the capability-derived one.
tile_size_override: Option<usize>,
}
impl HardwareContext {
    /// Shared constructor behind the public presets: detects capabilities and
    /// starts without a tile-size override.
    fn from_flags(prefer_gpu: bool, force_cpu: bool) -> Self {
        Self {
            capabilities: HardwareCapabilities::detect(),
            prefer_gpu,
            force_cpu,
            tile_size_override: None,
        }
    }

    /// Context with detected capabilities and no GPU preference.
    pub fn auto_detect() -> Self {
        Self::from_flags(false, false)
    }

    /// Context that never selects the GPU, whatever the capabilities say.
    pub fn cpu_only() -> Self {
        Self::from_flags(false, true)
    }

    /// Context that selects the GPU whenever a GPU backend is flagged as available.
    pub fn gpu_preferred() -> Self {
        Self::from_flags(true, false)
    }

    /// Builder-style setter: returns the context with `size` as the tile-size override.
    pub fn with_tile_size(self, size: usize) -> Self {
        Self {
            tile_size_override: Some(size),
            ..self
        }
    }

    /// Returns the override if one was set, otherwise the capability-derived tile size.
    pub fn tile_size(&self) -> usize {
        match self.tile_size_override {
            Some(size) => size,
            None => self.capabilities.matrix_tile_size(),
        }
    }

    /// Returns `true` only when the CPU is not forced, the GPU is preferred, and at
    /// least one of CUDA, ROCm or Metal is flagged in the capabilities.
    pub fn use_gpu(&self) -> bool {
        if self.force_cpu || !self.prefer_gpu {
            return false;
        }
        let caps = &self.capabilities;
        caps.has_cuda || caps.has_rocm || caps.has_metal
    }

    /// SIMD lane count of the underlying capability snapshot.
    pub fn simd_width(&self) -> usize {
        self.capabilities.simd_width()
    }

    /// Borrows the capability snapshot this context was built with.
    pub fn capabilities(&self) -> &HardwareCapabilities {
        &self.capabilities
    }
}
impl Default for HardwareContext {
fn default() -> Self {
Self::auto_detect()
}
}
/// Linear layer whose forward pass is dispatched according to a `HardwareContext`.
#[derive(Debug)]
pub struct HardwareLinear {
// Parameter storage and training flag; holds "weight" and, when enabled, "bias".
base: ModuleBase,
// First dimension of the weight tensor created in `new`.
in_features: usize,
// Second dimension of the weight tensor and length of the bias created in `new`.
out_features: usize,
// Whether a "bias" parameter was registered and is applied in the forward pass.
use_bias: bool,
// Clone of the context passed to `new`; drives backend selection in `forward`.
context: HardwareContext,
}
impl HardwareLinear {
    /// Creates a linear layer bound to a clone of `context`.
    ///
    /// The weight has shape `[in_features, out_features]` and is initialised with
    /// `kaiming_uniform` in `"fan_in"` mode. When `use_bias` is set, a zero bias of
    /// length `out_features` is registered as well.
    ///
    /// # Errors
    /// Propagates any error from weight initialisation or bias allocation.
    pub fn new(
        in_features: usize,
        out_features: usize,
        use_bias: bool,
        context: &HardwareContext,
    ) -> Result<Self> {
        let mut base = ModuleBase::new();
        let weight = crate::init::kaiming_uniform(&[in_features, out_features], "fan_in")?;
        base.register_parameter("weight".to_string(), Parameter::new(weight));
        if use_bias {
            let bias = zeros(&[out_features])?;
            base.register_parameter("bias".to_string(), Parameter::new(bias));
        }
        Ok(Self {
            base,
            in_features,
            out_features,
            use_bias,
            context: context.clone(),
        })
    }

    /// Applies the layer to `input`, picking a backend from the stored context.
    ///
    /// The GPU path is taken only when the context selects the GPU *and* CUDA is
    /// flagged as available. `HardwareContext::use_gpu` is also true for ROCm-only or
    /// Metal-only builds, but this layer has no GPU path for those, so such
    /// configurations fall back to the CPU paths instead of failing with a
    /// "cuda feature required" error. CPU paths are tried in the order
    /// AVX-512, AVX2, NEON, generic.
    ///
    /// # Errors
    /// Propagates any error from the underlying tensor operations.
    pub fn forward(&self, input: &Tensor) -> Result<Tensor> {
        let caps = self.context.capabilities();
        if self.context.use_gpu() && caps.has_cuda {
            self.forward_gpu(input)
        } else if caps.has_avx512 {
            self.forward_avx512(input)
        } else if caps.has_avx2 {
            self.forward_avx2(input)
        } else if caps.has_neon {
            self.forward_neon(input)
        } else {
            self.forward_generic(input)
        }
    }

    /// Clones the current value of the registered "weight" parameter.
    fn weight_tensor(&self) -> Tensor {
        self.base.parameters["weight"].tensor().read().clone()
    }

    /// Clones the current value of the "bias" parameter, or `None` when bias is disabled.
    fn bias_tensor(&self) -> Option<Tensor> {
        if self.use_bias {
            Some(self.base.parameters["bias"].tensor().read().clone())
        } else {
            None
        }
    }

    /// GPU path for CUDA builds; performs the same `functional::linear` call as the
    /// generic path.
    #[cfg(feature = "cuda")]
    fn forward_gpu(&self, input: &Tensor) -> Result<Tensor> {
        self.forward_generic(input)
    }

    /// GPU path for builds without CUDA: always an error. `forward` does not reach
    /// this in practice because it checks `has_cuda` first.
    #[cfg(not(feature = "cuda"))]
    fn forward_gpu(&self, _input: &Tensor) -> Result<Tensor> {
        Err(TorshError::Other(
            "GPU support not enabled (cuda feature required)".to_string(),
        ))
    }

    /// Shared body of the SIMD paths: `input.matmul(weight)` followed by an optional
    /// bias add.
    #[cfg(all(
        feature = "simd",
        any(target_arch = "x86_64", target_arch = "aarch64")
    ))]
    fn forward_matmul(&self, input: &Tensor) -> Result<Tensor> {
        let product = input.matmul(&self.weight_tensor())?;
        match self.bias_tensor() {
            Some(bias) => product.add(&bias),
            None => Ok(product),
        }
    }

    /// AVX-512 path: matmul-based on x86_64 builds with the `simd` feature.
    #[cfg(all(target_arch = "x86_64", feature = "simd"))]
    fn forward_avx512(&self, input: &Tensor) -> Result<Tensor> {
        self.forward_matmul(input)
    }

    /// AVX-512 path fallback for every other build: the generic implementation.
    #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
    fn forward_avx512(&self, input: &Tensor) -> Result<Tensor> {
        self.forward_generic(input)
    }

    /// AVX2 path: matmul-based on x86_64 builds with the `simd` feature.
    #[cfg(all(target_arch = "x86_64", feature = "simd"))]
    fn forward_avx2(&self, input: &Tensor) -> Result<Tensor> {
        self.forward_matmul(input)
    }

    /// AVX2 path fallback for every other build: the generic implementation.
    #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
    fn forward_avx2(&self, input: &Tensor) -> Result<Tensor> {
        self.forward_generic(input)
    }

    /// NEON path: matmul-based on aarch64 builds with the `simd` feature.
    #[cfg(all(target_arch = "aarch64", feature = "simd"))]
    fn forward_neon(&self, input: &Tensor) -> Result<Tensor> {
        self.forward_matmul(input)
    }

    /// NEON path fallback for every other build: the generic implementation.
    #[cfg(not(all(target_arch = "aarch64", feature = "simd")))]
    fn forward_neon(&self, input: &Tensor) -> Result<Tensor> {
        self.forward_generic(input)
    }

    /// Portable path: delegates to `crate::functional::linear` with the cloned weight
    /// and optional bias.
    fn forward_generic(&self, input: &Tensor) -> Result<Tensor> {
        let weight = self.weight_tensor();
        let bias = self.bias_tensor();
        crate::functional::linear(input, &weight, bias.as_ref())
    }

    /// Number of input features the layer was created with.
    pub fn in_features(&self) -> usize {
        self.in_features
    }

    /// Number of output features the layer was created with.
    pub fn out_features(&self) -> usize {
        self.out_features
    }

    /// Whether the layer was created with a bias term.
    pub fn has_bias(&self) -> bool {
        self.use_bias
    }

    /// Borrows the hardware context that drives backend selection.
    pub fn context(&self) -> &HardwareContext {
        &self.context
    }
}
impl Module for HardwareLinear {
fn forward(&self, input: &Tensor) -> Result<Tensor> {
self.forward(input)
}
fn parameters(&self) -> HashMap<String, Parameter> {
self.base.parameters.clone()
}
fn named_parameters(&self) -> HashMap<String, Parameter> {
self.base.parameters.clone()
}
fn train(&mut self) {
self.base.set_training(true);
}
fn eval(&mut self) {
self.base.set_training(false);
}
fn training(&self) -> bool {
self.base.training()
}
}
/// Prints a human-readable report of the detected hardware capabilities to stdout.
///
/// Cache sizes are stored in bytes and shown in KB (divided by 1024).
pub fn print_hardware_info() {
    let hw = HardwareCapabilities::detect();
    let to_kb = |bytes: usize| bytes / 1024;
    let lanes = hw.simd_width();
    let tile = hw.matrix_tile_size();

    println!("=== Hardware Capabilities ===");

    println!("CPU:");
    println!(" Cores: {}", hw.num_cores);
    println!(" AVX2: {}", hw.has_avx2);
    println!(" AVX-512: {}", hw.has_avx512);
    println!(" NEON: {}", hw.has_neon);
    println!(" SIMD Width: {} floats", lanes);

    println!("Cache:");
    println!(" L1: {} KB", to_kb(hw.l1_cache_size));
    println!(" L2: {} KB", to_kb(hw.l2_cache_size));
    println!(" L3: {} KB", to_kb(hw.l3_cache_size));
    println!(" Recommended tile size: {}", tile);

    println!("GPU:");
    println!(" CUDA: {}", hw.has_cuda);
    println!(" ROCm: {}", hw.has_rocm);
    println!(" Metal: {}", hw.has_metal);
    println!(" Tensor Cores: {}", hw.has_tensor_cores);
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Detected values must fall inside sane ranges.
    #[test]
    fn test_hardware_detection() {
        let detected = HardwareCapabilities::detect();
        assert!(detected.num_cores > 0);
        let lanes = detected.simd_width();
        assert!((1..=16).contains(&lanes));
        assert!(detected.l1_cache_size > 0);
        assert!(detected.l2_cache_size >= detected.l1_cache_size);
    }

    /// The preset contexts expose usable tiling/SIMD values and honour `cpu_only`.
    #[test]
    fn test_hardware_context() {
        let auto = HardwareContext::auto_detect();
        assert!(auto.tile_size() > 0);
        assert!(auto.simd_width() >= 1);

        assert!(!HardwareContext::cpu_only().use_gpu());

        // The outcome depends on the build features; only check that the call works.
        let preferred = HardwareContext::gpu_preferred();
        let _ = preferred.use_gpu();
    }

    /// Construction succeeds and the getters echo the constructor arguments.
    #[test]
    fn test_hardware_linear_creation() {
        let ctx = HardwareContext::auto_detect();
        let built = HardwareLinear::new(10, 5, true, &ctx);
        assert!(built.is_ok());
        let linear = built.unwrap();
        assert_eq!(linear.in_features(), 10);
        assert_eq!(linear.out_features(), 5);
        assert!(linear.has_bias());
    }

    /// A `[2, 10]` input through a 10 -> 5 layer yields a `[2, 5]` output on the CPU.
    #[test]
    fn test_hardware_linear_forward() {
        let ctx = HardwareContext::cpu_only();
        let linear = HardwareLinear::new(10, 5, true, &ctx).unwrap();
        let batch = randn(&[2, 10]).unwrap();
        let result = linear.forward(&batch);
        assert!(result.is_ok());
        let output = result.unwrap();
        assert_eq!(output.shape().dims(), &[2, 5]);
    }

    /// An explicit tile-size override wins over the detected value.
    #[test]
    fn test_custom_tile_size() {
        let ctx = HardwareContext::auto_detect().with_tile_size(64);
        assert_eq!(ctx.tile_size(), 64);
    }

    /// The SIMD width is always one of the expected power-of-two lane counts.
    #[test]
    fn test_simd_width_bounds() {
        let width = HardwareCapabilities::detect().simd_width();
        assert!(matches!(width, 1 | 2 | 4 | 8 | 16));
    }
}