pub mod arithmetic_ops;
pub mod basic_operations;
pub mod comparison_ops;
pub mod intrinsics;
pub mod math_functions;
pub mod statistics_ops;
#[allow(non_snake_case)]
#[cfg(all(test, not(feature = "no-std")))]
pub mod integration_test;
pub use arithmetic_ops::{
abs_vec, add_vec, divide_vec, fma, multiply_vec, neg_vec, reciprocal_vec, scale_vec,
square_vec, subtract_vec,
};
pub use basic_operations::{cosine_similarity, dot_product, euclidean_distance, norm_l1, norm_l2};
pub use comparison_ops::{
and_vec, eq_vec, ge_vec, gt_vec, le_vec, lt_vec, ne_vec, not_vec, or_vec, xor_vec,
};
pub use intrinsics::{
detect_simd_capabilities, optimal_chunk_size, simd_width_f32, F32x4, SimdCapabilities,
};
pub use math_functions::{cos_vec, exp_vec, ln_vec, pow_vec, sin_vec, sqrt_vec, tan_vec};
pub use statistics_ops::{
dot_product as stats_dot_product, max_vec, mean_vec, min_max_vec, min_vec,
norm_l1 as stats_norm_l1, norm_l2 as stats_norm_l2, norm_l2_squared, product_vec, std_dev_vec,
sum_vec, variance_vec,
};
pub use statistics_ops::sum_vec as sum;
pub use arithmetic_ops::scale_vec_inplace as scale;
pub use statistics_ops::mean_vec as mean;
pub use statistics_ops::{min_max_vec as min_max, variance_vec as variance};
pub use basic_operations::norm_l2 as norm;
pub use arithmetic_ops::add_vec as add_simd;
pub use arithmetic_ops::fma as fma_simd;
pub use basic_operations::{cross_product, outer_product};
#[cfg(feature = "no-std")]
use alloc::vec;
#[cfg(feature = "no-std")]
use alloc::vec::Vec;
#[cfg(not(feature = "no-std"))]
use std::vec::Vec;
#[cfg(feature = "no-std")]
use core::f32::consts;
#[cfg(not(feature = "no-std"))]
use std::f32::consts;
/// Settings shared by the SIMD routines of this module.
///
/// The fields are plain flags/values; how they are consumed is decided by the
/// operation modules and is not visible from this file.
#[derive(Debug, Clone)]
pub struct SimdConfig {
    /// Presumably allows routines to fall back to scalar code — confirm against callers.
    pub enable_scalar_fallback: bool,
    /// Size threshold related to SIMD dispatch (default `16`); exact meaning is
    /// defined by the consumers — TODO confirm.
    pub simd_threshold: usize,
    /// Accuracy-check switch; defaults to `true` only when `debug_assertions` is on.
    pub enable_accuracy_checks: bool,
}

impl Default for SimdConfig {
    /// Builds the default configuration: scalar fallback enabled, threshold 16,
    /// accuracy checks mirroring `cfg!(debug_assertions)`.
    fn default() -> Self {
        let enable_accuracy_checks = cfg!(debug_assertions);
        SimdConfig {
            enable_accuracy_checks,
            simd_threshold: 16,
            enable_scalar_fallback: true,
        }
    }
}
// Backing storage for the configuration read by `get_simd_config` and written by
// `set_simd_config`.
//
// With `std` available the value is thread-local: every thread has its own
// `RefCell<SimdConfig>` initialised from `SimdConfig::default()`, so a change made
// on one thread is not observed by another.
#[cfg(not(feature = "no-std"))]
thread_local! {
static SIMD_CONFIG: std::cell::RefCell<SimdConfig> = std::cell::RefCell::new(SimdConfig::default());
}
// Under the `no-std` feature a single `static mut` is used instead; `None` means
// nothing has been stored yet.
// NOTE(review): nothing in this file synchronises access to this static — callers
// appear to be responsible for not racing on it; confirm for multi-core targets.
#[cfg(feature = "no-std")]
static mut SIMD_CONFIG: Option<SimdConfig> = None;
/// Stores `config` as the active SIMD configuration.
///
/// With `std` the value is written to the calling thread's thread-local slot;
/// under the `no-std` feature it overwrites the single global static.
///
/// # Panics
///
/// In the `std` build this panics if the thread-local `RefCell` is already
/// borrowed when the call is made.
pub fn set_simd_config(config: SimdConfig) {
    #[cfg(not(feature = "no-std"))]
    {
        SIMD_CONFIG.with(|slot| {
            // The previous configuration is returned by `replace` and dropped here.
            slot.replace(config);
        });
    }
    #[cfg(feature = "no-std")]
    {
        let replacement = Some(config);
        // SAFETY: plain store to the `static mut`. This is only sound while no other
        // access to `SIMD_CONFIG` happens concurrently; this file does not enforce
        // that, so it is assumed to be upheld by callers — TODO confirm.
        unsafe {
            SIMD_CONFIG = replacement;
        }
    }
}
pub fn get_simd_config() -> SimdConfig {
#[cfg(not(feature = "no-std"))]
{
SIMD_CONFIG.with(|c| c.borrow().clone())
}
#[cfg(feature = "no-std")]
{
unsafe { core::ptr::addr_of!(SIMD_CONFIG).read().unwrap_or_default() }
}
}
/// Snapshot of platform properties as assembled by `detect_platform_info`.
pub struct PlatformInfo {
/// Result of `detect_simd_capabilities()`.
pub capabilities: SimdCapabilities,
/// Chunk size reported by `optimal_chunk_size(1000, None)`.
pub optimal_chunk_size: usize,
/// Value of `intrinsics::preferred_alignment_f32()`; presumably in bytes — TODO confirm.
pub recommended_alignment: usize,
}
/// Collects the platform description used by the rest of the module.
///
/// Queries, in this order, the SIMD capabilities, the chunk size suggested by
/// `optimal_chunk_size(1000, None)`, and the preferred `f32` alignment.
// NOTE(review): the meaning of the `1000` / `None` arguments is defined in
// `intrinsics` and is not visible here.
pub fn detect_platform_info() -> PlatformInfo {
    PlatformInfo {
        capabilities: detect_simd_capabilities(),
        optimal_chunk_size: optimal_chunk_size(1000, None),
        recommended_alignment: intrinsics::preferred_alignment_f32(),
    }
}
/// Returns a vector of `size` zeros.
///
/// The `_alignment` argument is accepted but not used: the buffer has whatever
/// alignment `Vec<f32>` provides, so callers needing a stricter alignment must
/// verify it themselves (for example with `is_properly_aligned`).
pub fn allocate_aligned_vec(size: usize, _alignment: usize) -> Vec<f32> {
    let mut buffer: Vec<f32> = Vec::with_capacity(size);
    buffer.resize(size, 0.0);
    buffer
}
/// Reports whether the address of the first element of `slice` is a multiple of
/// `alignment`.
///
/// An `alignment` of zero never panics; it is treated as "multiple of zero",
/// which only the zero address satisfies.
pub fn is_properly_aligned(slice: &[f32], alignment: usize) -> bool {
    let address = slice.as_ptr() as usize;
    match alignment {
        0 => address == 0,
        step => address % step == 0,
    }
}
#[cfg(not(feature = "no-std"))]
pub mod benchmarks {
use super::*;
use std::time::{Duration, Instant};
/// Outcome of one timed run produced by `benchmark_operation`.
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
/// Name passed to `benchmark_operation`.
pub operation: String,
/// Wall-clock time spent in the timed loop (warm-up calls excluded).
pub duration: Duration,
/// Timed iterations divided by `duration` in seconds.
pub ops_per_sec: f64,
/// `iterations * vector_size` divided by `duration` in seconds.
pub elements_per_sec: f64,
/// Platform name reported by the detected SIMD capabilities.
pub platform: String,
}
/// Times `operation` and reports throughput figures.
///
/// The closure is first called 10 times without timing as a warm-up, then
/// `iterations` times inside the measured interval.
///
/// * `name` – label copied into the result.
/// * `vector_size` – number of elements one call is considered to process; only
///   used to derive `elements_per_sec`.
/// * `iterations` – number of timed calls.
///
/// The rates are computed in `f64`, so a measured duration of zero yields
/// infinite (or NaN, for zero iterations) rates rather than a panic.
pub fn benchmark_operation<F>(
    name: &str,
    vector_size: usize,
    iterations: usize,
    operation: F,
) -> BenchmarkResult
where
    F: Fn(),
{
    const WARMUP_RUNS: usize = 10;
    for _ in 0..WARMUP_RUNS {
        operation();
    }

    let start = Instant::now();
    for _ in 0..iterations {
        operation();
    }
    let duration = start.elapsed();
    let seconds = duration.as_secs_f64();

    let platform_info = detect_platform_info();
    let platform_name = platform_info.capabilities.platform_name();

    // Multiply in f64: `iterations * vector_size` in `usize` can overflow
    // (panic in debug builds, silent wrap-around in release builds).
    let total_elements = iterations as f64 * vector_size as f64;

    BenchmarkResult {
        operation: name.to_string(),
        duration,
        ops_per_sec: iterations as f64 / seconds,
        elements_per_sec: total_elements / seconds,
        platform: platform_name.to_string(),
    }
}
/// Runs `benchmark_operation` once per entry of `sizes`.
///
/// For every size, `operation_factory(size)` builds the closure to time; the
/// results are returned in the same order as `sizes`, all labelled `name`.
pub fn benchmark_scaling<F>(
    name: &str,
    sizes: &[usize],
    iterations: usize,
    operation_factory: F,
) -> Vec<BenchmarkResult>
where
    F: Fn(usize) -> Box<dyn Fn()>,
{
    let mut results = Vec::with_capacity(sizes.len());
    for &size in sizes {
        let workload = operation_factory(size);
        results.push(benchmark_operation(name, size, iterations, workload));
    }
    results
}
}
/// Helpers for comparing an approximate vector routine against a scalar reference.
pub mod accuracy {
    use super::*;

    /// Error statistics gathered by [`test_accuracy`].
    #[derive(Debug, Clone)]
    pub struct AccuracyResult {
        /// Largest absolute error over all test points (NaN if any error is NaN).
        pub max_abs_error: f32,
        /// Root-mean-square of the signed errors (`0.0` for an empty input).
        pub rms_error: f32,
        /// Mean of the absolute errors (`0.0` for an empty input).
        pub mean_abs_error: f32,
        /// Number of inputs that were evaluated.
        pub test_points: usize,
        /// Grade derived from `max_abs_error`.
        pub grade: AccuracyGrade,
    }

    /// Coarse quality bucket based on the maximum absolute error:
    /// `A` below `1e-6`, `B` below `1e-5`, `C` below `1e-4`, `D` below `1e-3`,
    /// `F` for everything else (including a NaN error).
    #[derive(Debug, Clone, PartialEq)]
    pub enum AccuracyGrade {
        A,
        B,
        C,
        D,
        F,
    }

    /// Signed error of one sample.
    ///
    /// Equal values (which covers matching infinities) and a NaN/NaN pair count as
    /// exact agreement; subtracting them would otherwise produce NaN.
    fn sample_error(approx: f32, expected: f32) -> f32 {
        if approx == expected || (approx.is_nan() && expected.is_nan()) {
            0.0
        } else {
            approx - expected
        }
    }

    /// Evaluates `approximation` on `test_inputs` and compares every output with
    /// `reference`.
    ///
    /// * `approximation` – fills the output slice (same length as the input slice).
    /// * `reference` – scalar ground truth, called once per input.
    ///
    /// Returns the maximum, mean and RMS error plus a grade. If the approximation
    /// and the reference disagree in a way that yields a NaN error (for example a
    /// NaN output where a finite value is expected) the maximum error is NaN and
    /// the grade is `F`. An empty input produces zero errors.
    pub fn test_accuracy<F, R>(
        approximation: F,
        reference: R,
        test_inputs: &[f32],
    ) -> AccuracyResult
    where
        F: Fn(&[f32], &mut [f32]),
        R: Fn(f32) -> f32,
    {
        let mut approx_results = vec![0.0; test_inputs.len()];
        approximation(test_inputs, &mut approx_results);

        let mut max_abs_error = 0.0f32;
        let mut abs_sum = 0.0f32;
        let mut square_sum = 0.0f32;
        for (&input, &approx) in test_inputs.iter().zip(&approx_results) {
            let error = sample_error(approx, reference(input));
            let abs_error = error.abs();
            // `f32::max` silently discards a NaN operand, which would hide broken
            // outputs; propagate NaN explicitly instead.
            max_abs_error = if max_abs_error.is_nan() || abs_error.is_nan() {
                f32::NAN
            } else {
                max_abs_error.max(abs_error)
            };
            abs_sum += abs_error;
            square_sum += error * error;
        }

        // Avoid 0/0 when there is nothing to average.
        let (mean_abs_error, rms_error) = if test_inputs.is_empty() {
            (0.0, 0.0)
        } else {
            let count = test_inputs.len() as f32;
            (abs_sum / count, (square_sum / count).sqrt())
        };

        // Every comparison with NaN is false, so a NaN maximum falls through to `F`.
        let grade = match max_abs_error {
            e if e < 1e-6 => AccuracyGrade::A,
            e if e < 1e-5 => AccuracyGrade::B,
            e if e < 1e-4 => AccuracyGrade::C,
            e if e < 1e-3 => AccuracyGrade::D,
            _ => AccuracyGrade::F,
        };

        AccuracyResult {
            max_abs_error,
            rms_error,
            mean_abs_error,
            test_points: test_inputs.len(),
            grade,
        }
    }

    /// Builds a sorted, de-duplicated list of test inputs.
    ///
    /// `num_points` evenly spaced values are taken from `range_start` (inclusive)
    /// towards `range_end` (the end point itself is not sampled). When
    /// `include_special_values` is set, a fixed table of notable values (zero,
    /// multiples of pi, e, small and round numbers, both signs) is added, limited
    /// to those inside `[range_start, range_end]`.
    ///
    /// # Panics
    ///
    /// Panics while sorting if a generated value is NaN, which happens when the
    /// bounds are NaN or otherwise produce a NaN step.
    pub fn generate_test_inputs(
        range_start: f32,
        range_end: f32,
        num_points: usize,
        include_special_values: bool,
    ) -> Vec<f32> {
        let special_values = [
            0.0,
            -0.0,
            consts::PI,
            -consts::PI,
            consts::PI / 2.0,
            -consts::PI / 2.0,
            consts::PI / 4.0,
            -consts::PI / 4.0,
            consts::E,
            -consts::E,
            1.0,
            -1.0,
            2.0,
            -2.0,
            10.0,
            -10.0,
            0.1,
            -0.1,
            0.001,
            -0.001,
            1e-6,
            -1e-6,
        ];

        let mut inputs = Vec::with_capacity(num_points + special_values.len());
        let step = (range_end - range_start) / (num_points as f32);
        inputs.extend((0..num_points).map(|i| range_start + i as f32 * step));

        if include_special_values {
            inputs.extend(
                special_values
                    .iter()
                    .copied()
                    .filter(|value| (range_start..=range_end).contains(value)),
            );
        }

        inputs.sort_by(|a, b| a.partial_cmp(b).expect("operation should succeed"));
        inputs.dedup();
        inputs
    }
}
pub mod utils {
use super::*;
/// Verifies that two operand slices have the same number of elements.
///
/// # Errors
///
/// Returns `"Vectors must have the same length"` when the lengths differ.
pub fn check_compatible_lengths(a: &[f32], b: &[f32]) -> Result<(), &'static str> {
    match a.len() == b.len() {
        true => Ok(()),
        false => Err("Vectors must have the same length"),
    }
}

/// Verifies that an output slice is as long as its input slice.
///
/// This is the same check (and the same error) as `check_compatible_lengths`.
pub fn check_io_lengths(input: &[f32], output: &[f32]) -> Result<(), &'static str> {
    let verdict = check_compatible_lengths(input, output);
    verdict
}
/// Rejects empty slices.
///
/// # Errors
///
/// Returns `"Vector cannot be empty"` when `vec` has no elements.
pub fn check_not_empty(vec: &[f32]) -> Result<(), &'static str> {
    match vec {
        [] => Err("Vector cannot be empty"),
        _ => Ok(()),
    }
}
pub fn get_platform_chunk_size() -> usize {
detect_platform_info().optimal_chunk_size
}
/// Splits `vec` into a prefix whose length is a multiple of `chunk_size` and the
/// leftover tail.
///
/// The first slice holds as many complete chunks as fit; the second holds the
/// remaining `vec.len() % chunk_size` elements. A `chunk_size` of zero cannot form
/// any chunk, so the prefix is empty and the whole input is returned as the tail
/// (previously this case panicked with a division by zero).
pub fn chunk_vector(vec: &[f32], chunk_size: usize) -> (&[f32], &[f32]) {
    let full_len = if chunk_size == 0 {
        0
    } else {
        vec.len() - vec.len() % chunk_size
    };
    vec.split_at(full_len)
}
/// Feeds `vec` to two callbacks: complete chunks first, then the leftover tail.
///
/// `chunk_processor` is called once for every consecutive group of exactly
/// `chunk_size` elements, in order. `remainder_processor` is then called once
/// with the remaining elements, but only if there are any.
///
/// A `chunk_size` of zero cannot form chunks: `chunk_processor` is never called
/// and the whole (non-empty) input goes to `remainder_processor`. Previously this
/// case panicked.
pub fn process_chunks<F, R>(
    vec: &[f32],
    chunk_size: usize,
    mut chunk_processor: F,
    mut remainder_processor: R,
) where
    F: FnMut(&[f32]),
    R: FnMut(&[f32]),
{
    if chunk_size == 0 {
        // `chunks_exact(0)` would panic; treat everything as remainder instead.
        if !vec.is_empty() {
            remainder_processor(vec);
        }
        return;
    }

    let chunks = vec.chunks_exact(chunk_size);
    // The tail is known up front and borrows from `vec`, not from the iterator.
    let remainder = chunks.remainder();
    for chunk in chunks {
        chunk_processor(chunk);
    }
    if !remainder.is_empty() {
        remainder_processor(remainder);
    }
}
/// Converts an angle from degrees to radians (`degrees * PI / 180`).
pub fn degrees_to_radians(degrees: f32) -> f32 {
    // Multiply first, then divide: keeps the rounding of the original expression.
    let scaled = degrees * consts::PI;
    scaled / 180.0
}
/// Converts an angle from radians to degrees (`radians * 180 / PI`).
pub fn radians_to_degrees(radians: f32) -> f32 {
    // Multiply first, then divide: keeps the rounding of the original expression.
    let scaled = radians * 180.0;
    scaled / consts::PI
}
/// Divides `numerator` by `denominator`, saturating to a signed infinity when the
/// denominator is (almost) zero.
///
/// When `|denominator| < f32::EPSILON` the result is `+inf` or `-inf`; its sign is
/// the sign the true quotient would have, i.e. it takes the sign of the
/// denominator (including `-0.0`) into account. A zero numerator counts as
/// non-negative, so `safe_divide(0.0, 0.0)` is `+inf`. A NaN numerator yields NaN.
/// In every other case the ordinary quotient is returned.
pub fn safe_divide(numerator: f32, denominator: f32) -> f32 {
    if denominator.abs() < f32::EPSILON {
        if numerator.is_nan() {
            // NaN has no sign to saturate towards; propagate it.
            return f32::NAN;
        }
        // Signs differ => negative quotient.
        let negative = (numerator < 0.0) != denominator.is_sign_negative();
        if negative {
            f32::NEG_INFINITY
        } else {
            f32::INFINITY
        }
    } else {
        numerator / denominator
    }
}
/// Restricts `value` to the interval `[min, max]`.
///
/// The lower bound is checked first, so with inverted bounds a value below `min`
/// still maps to `min`. A NaN `value` compares false against both bounds and is
/// returned unchanged.
pub fn clamp(value: f32, min: f32, max: f32) -> f32 {
    match value {
        v if v < min => min,
        v if v > max => max,
        v => v,
    }
}
}
/// Numeric constants shared by the SIMD routines.
pub mod constants {
    #[cfg(feature = "no-std")]
    use core::f32::consts;
    #[cfg(not(feature = "no-std"))]
    use std::f32::consts;

    // --- Mathematical constants, re-exported as `f32` ---

    /// Pi.
    pub const PI_F32: f32 = consts::PI;
    /// Euler's number.
    pub const E_F32: f32 = consts::E;
    /// Natural logarithm of 2.
    pub const LN_2_F32: f32 = consts::LN_2;
    /// Natural logarithm of 10.
    pub const LN_10_F32: f32 = consts::LN_10;
    /// Square root of 2.
    pub const SQRT_2_F32: f32 = consts::SQRT_2;

    // --- Vector sizes per instruction set ---
    // NOTE(review): presumably the number of `f32` lanes per register — confirm.

    /// Vector size for SSE2.
    pub const SSE2_VECTOR_SIZE: usize = 4;
    /// Vector size for AVX2.
    pub const AVX2_VECTOR_SIZE: usize = 8;
    /// Vector size for AVX-512.
    pub const AVX512_VECTOR_SIZE: usize = 16;
    /// Vector size for NEON.
    pub const NEON_VECTOR_SIZE: usize = 4;

    // --- Alignments per instruction set ---
    // NOTE(review): presumably expressed in bytes — confirm.

    /// Alignment for SSE2.
    pub const SSE2_ALIGNMENT: usize = 16;
    /// Alignment for AVX2.
    pub const AVX2_ALIGNMENT: usize = 32;
    /// Alignment for AVX-512.
    pub const AVX512_ALIGNMENT: usize = 64;
    /// Alignment for NEON.
    pub const NEON_ALIGNMENT: usize = 16;
}
#[allow(non_snake_case)]
#[cfg(all(test, not(feature = "no-std")))]
mod tests {
    use super::*;
    #[cfg(feature = "no-std")]
    use alloc::{vec, vec::Vec};

    /// A configuration written with `set_simd_config` is read back unchanged.
    #[test]
    fn test_simd_config() {
        let original = SimdConfig::default();
        set_simd_config(original.clone());

        let stored = get_simd_config();
        assert_eq!(stored.simd_threshold, original.simd_threshold);
        assert_eq!(
            stored.enable_scalar_fallback,
            original.enable_scalar_fallback
        );
    }

    /// Platform detection reports a chunk size and alignment of at least 4.
    #[test]
    fn test_platform_detection() {
        let info = detect_platform_info();
        assert!(info.optimal_chunk_size >= 4);
        assert!(info.recommended_alignment >= 4);
        println!("SIMD Capabilities: {:?}", info.capabilities);
    }

    /// The allocation helper returns a zero-filled buffer of the requested length.
    #[test]
    fn test_aligned_allocation() {
        let buffer = allocate_aligned_vec(16, 16);
        assert_eq!(buffer.len(), 16);
        assert_eq!(buffer[0], 0.0);
    }

    /// Smoke test covering the helpers in `utils`.
    #[test]
    fn test_utils() {
        use utils::*;

        // Length validation.
        let three_a = vec![1.0, 2.0, 3.0];
        let three_b = vec![4.0, 5.0, 6.0];
        let two = vec![7.0, 8.0];
        assert!(check_compatible_lengths(&three_a, &three_b).is_ok());
        assert!(check_compatible_lengths(&three_a, &two).is_err());

        // Emptiness validation.
        let nothing: Vec<f32> = Vec::new();
        assert!(check_not_empty(&nothing).is_err());
        assert!(check_not_empty(&three_a).is_ok());

        // Nine elements in chunks of four: eight in full chunks, one left over.
        let nine = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
        let (full, tail) = chunk_vector(&nine, 4);
        assert_eq!(full.len(), 8);
        assert_eq!(tail.len(), 1);

        // Angle conversions.
        assert!((degrees_to_radians(180.0) - constants::PI_F32).abs() < f32::EPSILON);
        assert!((radians_to_degrees(constants::PI_F32) - 180.0).abs() < f32::EPSILON);

        // Division with a zero denominator saturates.
        assert_eq!(safe_divide(10.0, 2.0), 5.0);
        assert_eq!(safe_divide(10.0, 0.0), f32::INFINITY);
        assert_eq!(safe_divide(-10.0, 0.0), f32::NEG_INFINITY);

        // Clamping inside, below and above the range.
        assert_eq!(clamp(5.0, 1.0, 10.0), 5.0);
        assert_eq!(clamp(-5.0, 1.0, 10.0), 1.0);
        assert_eq!(clamp(15.0, 1.0, 10.0), 10.0);
    }

    /// Grades compare by variant.
    #[test]
    fn test_accuracy_grading() {
        use accuracy::AccuracyGrade;

        let (best, worst) = (AccuracyGrade::A, AccuracyGrade::F);
        assert!(best != worst);
        assert_eq!(best, AccuracyGrade::A);
    }
}
#[allow(non_snake_case)]
#[cfg(all(test, not(feature = "no-std")))]
mod integration_tests {
    use super::*;
    #[cfg(feature = "no-std")]
    use alloc::{vec, vec::Vec};

    /// Sanity check on two fixed eight-element operands.
    #[test]
    fn test_basic_workflow() {
        let ascending = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let descending = [8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0];
        assert_eq!(ascending.len(), descending.len());
        assert_eq!(ascending.len(), 8);
    }

    /// Prints the detected platform details and checks their lower bounds.
    #[test]
    fn test_platform_optimization_paths() {
        let info = detect_platform_info();
        let chunk = info.optimal_chunk_size;
        let alignment = info.recommended_alignment;

        println!("SIMD capabilities: {:?}", info.capabilities);
        println!("Platform name: {}", info.capabilities.platform_name());
        println!("Optimal chunk size: {}", chunk);
        println!("Recommended alignment: {}", alignment);

        assert!(chunk >= 1);
        assert!(alignment >= 4);
    }
}