1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
//! # In One Sentence
//!
//! You want to use [`std::simd`](https://github.com/rust-lang-nursery/packed_simd/) but realized there is no simple, safe and fast way to align your `f32x8` (and friends) in memory _and_ treat them as regular `f32` slices for easy loading and manipulation; `simd_aligned` to the rescue.
//!
//!
//!
//! # Highlights
//!
//! * built on top of [`std::simd`](https://github.com/rust-lang-nursery/packed_simd/) for easy data handling
//! * supports everything from `u8x2` to `f64x8`
//! * think in flat slices (`&[f32]`), but get performance of properly aligned SIMD vectors (`&[f32x16]`)
//! * defines `u8s`, ..., `f36s` as "best guess" for current platform (WIP)
//! * provides N-dimensional [`VectorD`] and NxM-dimensional [`MatrixD`].
//!
//!
//! **Note**: Right now this is an experimental crate. Features might be added or removed depending on how [`std::simd`](https://github.com/rust-lang-nursery/packed_simd/) evolves. At the end of the day it's just about being able to load and manipulate data without much fuzz.
//!
//!
//! # Examples
//!
//! Produces a vector that can hold `10` elements of type `f64`. Might internally
//! allocate `5` elements of type `f64x2`, or `3` of type `f64x4`, depending on the platform.
//! All elements are guaranteed to be properly aligned for fast access.
//!
//! ```rust
//! use packed_simd::*;
//! use simd_aligned::*;
//!
//! // Create vectors of `10` f64 elements with value `0.0`.
//! let mut v1 = VectorD::<f64s>::with(0.0, 10);
//! let mut v2 = VectorD::<f64s>::with(0.0, 10);
//!
//! // Get "flat", mutable view of the vector, and set individual elements:
//! let v1_m = v1.flat_mut();
//! let v2_m = v2.flat_mut();
//!
//! // Set some elements on v1
//! v1_m[0] = 0.0;
//! v1_m[4] = 4.0;
//! v1_m[8] = 8.0;
//!
//! // Set some others on v2
//! v2_m[1] = 0.0;
//! v2_m[5] = 5.0;
//! v2_m[9] = 9.0;
//!
//! let mut sum = f64s::splat(0.0);
//!
//! // Eventually, do something with the actual SIMD types. Does
//! // `std::simd` vector math, e.g., f64x8 + f64x8 in one operation:
//! sum = v1[0] + v2[0];
//! ```
//!
//! # Benchmarks
//!
//! There is no performance penalty for using `simd_aligned`, while retaining all the
//! simplicity of handling flat arrays.
//!
//! ```ignore
//! test vectors::packed       ... bench:          77 ns/iter (+/- 4)
//! test vectors::scalar       ... bench:       1,177 ns/iter (+/- 464)
//! test vectors::simd_aligned ... bench:          71 ns/iter (+/- 5)
//! ```
//!
//! # FAQ
//!
//! ### How does it relate to [faster](https://github.com/AdamNiederer/faster) and [`std::simd`](https://github.com/rust-lang-nursery/packed_simd/)?
//!
//! * `simd_aligned` builds on top of `std::simd`. At aims to provide common, SIMD-aligned
//! data structure that support simple and safe scalar access patterns.
//!
//! * `faster` (as of today) is really good if you already have exiting flat slices in your code
//! and want operate them "full SIMD ahead". However, in particular when dealing with multiple
//! slices at the same time (e.g., kernel computations) the performance impact of unaligned arrays can
//! become a bit more noticeable (e.g., in the case of [ffsvm](https://github.com/ralfbiedert/ffsvm-rust/) up to 10% - 20%).

#![warn(clippy::all)] // Enable ALL the warnings ...
#![warn(clippy::nursery)]
#![warn(clippy::pedantic)]
#![warn(clippy::cargo)]
#![allow(clippy::module_name_repetitions)]
#![allow(clippy::module_inception)]

mod conversion;
mod matrix;
mod packed;
mod vector;

pub mod arch;
pub mod traits;

use packed_simd::*;

pub use crate::{
    arch::current::*,
    conversion::{packed_as_flat, packed_as_flat_mut},
    matrix::{AccessStrategy, Columns, MatrixD, MatrixFlat, MatrixFlatMut, Rows},
    vector::VectorD,
};

macro_rules! impl_simd {
    ($simd:ty, $element:ty, $lanes:expr, $lanestype:ty) => {
        impl crate::traits::Simd for $simd {
            type Element = $element;
            type LanesType = $lanestype;

            const LANES: usize = $lanes;

            fn splat(t: Self::Element) -> Self { Self::splat(t) }
        }
    };
}
impl_simd!(u8x2, u8, 2, [u8; 2]);
impl_simd!(u8x4, u8, 4, [u8; 4]);
impl_simd!(u8x8, u8, 8, [u8; 8]);
impl_simd!(u8x16, u8, 16, [u8; 16]);
impl_simd!(u8x32, u8, 32, [u8; 32]);

impl_simd!(i8x2, i8, 2, [i8; 2]);
impl_simd!(i8x4, i8, 4, [i8; 4]);
impl_simd!(i8x8, i8, 8, [i8; 8]);
impl_simd!(i8x16, i8, 16, [i8; 16]);
impl_simd!(i8x32, i8, 32, [i8; 32]);

impl_simd!(u16x2, u16, 2, [u16; 2]);
impl_simd!(u16x4, u16, 4, [u16; 4]);
impl_simd!(u16x8, u16, 8, [u16; 8]);
impl_simd!(u16x16, u16, 16, [u16; 16]);

impl_simd!(i16x2, i16, 2, [i16; 2]);
impl_simd!(i16x4, i16, 4, [i16; 4]);
impl_simd!(i16x8, i16, 8, [i16; 8]);
impl_simd!(i16x16, i16, 16, [i16; 16]);

impl_simd!(u32x2, u32, 2, [u32; 2]);
impl_simd!(u32x4, u32, 4, [u32; 4]);
impl_simd!(u32x8, u32, 8, [u32; 8]);

impl_simd!(i32x2, i32, 2, [i32; 2]);
impl_simd!(i32x4, i32, 4, [i32; 4]);
impl_simd!(i32x8, i32, 8, [i32; 8]);

impl_simd!(u64x2, u64, 2, [u64; 2]);
impl_simd!(u64x4, u64, 4, [u64; 4]);

impl_simd!(i64x2, i64, 2, [i64; 2]);
impl_simd!(i64x4, i64, 4, [i64; 4]);

impl_simd!(f32x2, f32, 2, [f32; 2]);
impl_simd!(f32x4, f32, 4, [f32; 4]);
impl_simd!(f32x8, f32, 8, [f32; 8]);
impl_simd!(f32x16, f32, 16, [f32; 16]);

impl_simd!(f64x2, f64, 2, [f64; 2]);
impl_simd!(f64x4, f64, 4, [f64; 4]);
impl_simd!(f64x8, f64, 8, [f64; 8]);