use crate::Result;
#[cfg(feature = "simd")]
use scirs2_core::parallel_ops::{par_chunks, par_join, par_scope};
// Public submodules declared by this file. Because each one is `pub`, its
// items can also be reached through the submodule path itself.
pub mod activation_functions;
pub mod basic_ops;
pub mod benchmarks;
pub mod capabilities;
pub mod math_functions;
pub mod matrix_ops;
pub mod reduction_ops;
pub mod advanced_kernels;
pub mod cache_friendly_ops;
pub mod ultra_simd_engine;
// Re-exports: selected items from the submodules above are surfaced at this
// module's level so callers do not have to spell out the submodule name.
pub use activation_functions::ActivationFunctions;
pub use basic_ops::BasicOps;
pub use benchmarks::{BenchmarkResult, Benchmarks};
pub use capabilities::{Capabilities, PerformanceHints, SimdCapabilities};
pub use math_functions::MathFunctions;
pub use matrix_ops::MatrixOps;
pub use reduction_ops::ReductionOps;
pub use ultra_simd_engine::{
global_simd_engine, ConvolutionParams, CpuFeatures, ElementWiseOp, ReductionOp,
SimdEngineConfig, UltraSimdEngine,
};
pub use cache_friendly_ops::{CacheFriendlyMatMul, CacheOptimizedTensorOps, MemoryAccessPattern};
pub use advanced_kernels::{AdvancedKernelRegistry, KernelOptimizationStrategy, SpecializedKernel};
/// Stateless facade type for the SIMD helpers exposed by this module.
///
/// `SimdOptimizer` is a unit struct: it holds no data and is used purely as a
/// namespace for associated functions. The common traits are derived so the
/// type can be debug-printed, copied freely, and default-constructed (for
/// example when it is embedded in another type that derives those traits).
#[derive(Debug, Clone, Copy, Default)]
pub struct SimdOptimizer;
impl SimdOptimizer {
#[inline(always)]
pub fn add_f32_unchecked(a: &[f32], b: &[f32], result: &mut [f32]) {
BasicOps::add_f32_unchecked(a, b, result)
}
pub fn add_f32_optimized(a: &[f32], b: &[f32], result: &mut [f32]) -> Result<()> {
BasicOps::add_f32_optimized(a, b, result)
}
pub fn add_f32_auto(a: &[f32], b: &[f32], result: &mut [f32]) -> Result<()> {
BasicOps::add_f32_auto(a, b, result)
}
#[inline(always)]
pub fn mul_f32_unchecked(a: &[f32], b: &[f32], result: &mut [f32]) {
BasicOps::mul_f32_unchecked(a, b, result)
}
pub fn mul_f32_optimized(a: &[f32], b: &[f32], result: &mut [f32]) -> Result<()> {
BasicOps::mul_f32_optimized(a, b, result)
}
pub fn sub_f32_optimized(a: &[f32], b: &[f32], result: &mut [f32]) -> Result<()> {
BasicOps::sub_f32_optimized(a, b, result)
}
pub fn relu_f32_optimized(input: &[f32], output: &mut [f32]) -> Result<()> {
ActivationFunctions::relu_f32_optimized(input, output)
}
pub fn sigmoid_f32_optimized(input: &[f32], output: &mut [f32]) -> Result<()> {
ActivationFunctions::sigmoid_f32_optimized(input, output)
}
pub fn matmul_f32_blocked(
a: &[f32],
b: &[f32],
c: &mut [f32],
m: usize,
n: usize,
k: usize,
block_size: usize,
) -> Result<()> {
MatrixOps::matmul_f32_blocked(a, b, c, m, n, k, block_size)
}
pub fn dot_product_f32_optimized(a: &[f32], b: &[f32]) -> Result<f32> {
MatrixOps::dot_product_f32_optimized(a, b)
}
#[inline(always)]
pub fn sum_f32_unchecked(input: &[f32]) -> f32 {
ReductionOps::sum_f32_unchecked(input)
}
#[inline(always)]
pub fn max_f32_unchecked(input: &[f32]) -> f32 {
ReductionOps::max_f32_unchecked(input)
}
pub fn detect_capabilities() -> SimdCapabilities {
Capabilities::detect_capabilities()
}
pub fn reduce_sum_f32_optimized(input: &[f32]) -> Result<f32> {
ReductionOps::reduce_sum_f32_optimized(input)
}
pub fn reduce_min_max_f32_optimized(input: &[f32]) -> Result<(f32, f32)> {
ReductionOps::reduce_min_max_f32_optimized(input)
}
pub fn normalize_f32_optimized(input: &[f32], output: &mut [f32], eps: f32) -> Result<()> {
ReductionOps::normalize_f32_optimized(input, output, eps)
}
}