#![cfg_attr(not(feature = "std"), no_std)]
#![cfg_attr(
    all(feature = "nightly", target_arch = "wasm32"),
    feature(wasm_simd, wasm_target_feature)
)]
#![cfg_attr(
    all(feature = "nightly", target_arch = "aarch64"),
    feature(stdsimd, aarch64_target_feature)
)]
//! `generic-simd` provides safe and idiomatic zero-cost abstractions for writing explicit
//! cross-platform SIMD operations.
//!
//! # Supported architectures
//! All architectures are supported via scalar fallbacks, and the following instruction sets
//! additionally have vectorized support:
//! * SSE4.1 (x86/x86-64)
//! * AVX (x86/x86-64)
//! * NEON (aarch64, with `nightly` cargo feature)
//! * SIMD128 (wasm32, with `nightly` cargo feature and `simd128` target feature)
//!
//! The various architecture-specific types are available in the [`arch`](arch/index.html) module.
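//!
//! The nightly-only backends are opted into via a cargo feature.  As a sketch (the version
//! number is illustrative, not prescriptive):
//! ```toml
//! [dependencies]
//! generic-simd = { version = "0.1", features = ["nightly"] }
//! ```
//! For wasm32, the `simd128` target feature must also be enabled, e.g. by building with
//! `RUSTFLAGS="-C target-feature=+simd128"`.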
//!
//! # Abstractions
//! Vector abstractions are provided via the traits in the [`vector`](vector/index.html) module.
//! Generics that use these traits can take advantage of any of the supported instruction sets.
//!
//! The following example performs a vector-accelerated sum of an input slice:
//! ```
//! use generic_simd::{
//!     arch::Token,
//!     dispatch,
//!     scalar::ScalarExt,
//!     slice::SliceExt,
//!     vector::NativeVector,
//! };
//!
//! // This function provides a generic implementation for any instruction set.
//! // Here we use the "native" vector type, i.e. the widest vector directly supported by the
//! // architecture.
//! #[inline]
//! fn sum_impl<T>(token: T, input: &[f32]) -> f32
//! where
//!     T: Token,
//!     f32: ScalarExt<T> + core::iter::Sum<NativeVector<f32, T>>,
//! {
//!     // This example uses aligned loads, which can be faster on some architectures.
//!     let (start, vectors, end) = input.align_native(token);
//!
//!     // Sum across the vector lanes, plus the unaligned portions
//!     vectors.iter().copied().sum::<f32>() + start.iter().chain(end).sum::<f32>()
//! }
//!
//! // This function selects the best instruction set at runtime.
//! // The "dispatch" macro compiles this function for each supported architecture.
//! #[dispatch(token)]
//! fn sum(input: &[f32]) -> f32 {
//!     sum_impl(token, input)
//! }
//!
//! assert_eq!(sum(&[1f32; 10]), 10.);
//! ```
//!
//! # Vector shims
//! Different instruction sets provide vectors of different widths, so shims are available for
//! creating vectors of a particular width regardless of architecture.  These are located in the
//! [`shim`](shim/index.html) module.
//!
//! For example, the following function performs an [Array of Structures of Arrays](https://en.wikipedia.org/wiki/AoS_and_SoA)
//! operation using arrays of 4 `f64`s regardless of instruction set:
//! ```
//! use generic_simd::{
//!     arch::Token,
//!     dispatch,
//!     scalar::Scalar,
//!     slice::Slice,
//!     vector::{Signed, Vector, width},
//! };
//!
//! // Equivalent to an array of four 2-dimensional coordinates,
//! // but with a vectorizable memory layout.
//! struct Coordinates {
//!     x: [f64; 4],
//!     y: [f64; 4],
//! }
//!
//! // A generic mean implementation for any instruction set.
//! fn mean_impl<T>(token: T, input: &[Coordinates]) -> (f64, f64)
//! where
//!     T: Token,
//!     f64: Scalar<T, width::W4>,
//!     <f64 as Scalar<T, width::W4>>::Vector: Signed,
//! {
//!     let mut xsum = f64::zeroed(token);
//!     let mut ysum = f64::zeroed(token);
//!
//!     for Coordinates { x, y } in input {
//!         // read the arrays into vectors
//!         xsum += x.read(token);
//!         ysum += y.read(token);
//!     }
//!
//!     // sum across the vector lanes
//!     (
//!         xsum.iter().sum::<f64>() / (input.len() * 4) as f64,
//!         ysum.iter().sum::<f64>() / (input.len() * 4) as f64,
//!     )
//! }
//!
//! // Selects the best instruction set at runtime.
//! #[dispatch(token)]
//! fn mean(input: &[Coordinates]) -> (f64, f64) {
//!     mean_impl(token, input)
//! }
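//!
//! // Usage sketch: a single `Coordinates` batch holding the points
//! // (-1, 2), (1, 2), (-1, 2), (1, 2), whose mean is (0, 2).
//! let coords = Coordinates {
//!     x: [-1., 1., -1., 1.],
//!     y: [2., 2., 2., 2.],
//! };
//! assert_eq!(mean(&[coords]), (0., 2.));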
//! ```

// Re-export for use from macros.
#[doc(hidden)]
pub use multiversion;

/// Multiversions a function over all supported instruction sets.
///
/// Tagging a function with `#[dispatch(token)]` creates a variant of the function for each
/// supported instruction set and binds that instruction set's token to `token`.
/// The best supported variant is selected at runtime.
///
/// # Implementation
/// This attribute is a wrapper for [`multiversion`] and supports all of its
/// conditional compilation and static dispatch features.
///
/// # Example
/// ```
/// use generic_simd::slice::SliceExt;
///
/// #[generic_simd::dispatch(token)]
/// pub fn add_one(x: &mut [f32]) {
///     let (start, vecs, end) = x.align_native_mut(token);
///     for s in start.iter_mut().chain(end.iter_mut()) {
///         *s += 1.;
///     }
///
///     for v in vecs {
///         *v += 1.;
///     }
/// }
///
/// #[generic_simd::dispatch(_token)]
/// pub fn add_two(x: &mut [f32]) {
///     // Static dispatching provided by `multiversion`.
///     // This does not perform runtime feature selection and allows inlining.
///     dispatch!(add_one(x));
///     dispatch!(add_one(x));
/// }
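///
/// // Usage sketch: `add_two` applies `add_one` twice, so every element gains 2.
/// let mut values = [0f32; 8];
/// add_two(&mut values);
/// assert_eq!(values, [2f32; 8]);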
/// ```
///
/// [`multiversion`]: ../multiversion/attr.multiversion.html
pub use generic_simd_macros::dispatch;

#[macro_use]
mod implementation;

pub mod alignment;
pub mod arch;
pub mod pointer;
pub mod scalar;
pub mod shim;
pub mod slice;
pub mod vector;