//! # In One Sentence
//!
//! You want to use [`std::simd`](https://github.com/rust-lang-nursery/packed_simd/) but realized there is
//! no simple, safe and fast way to align your `f32x8` (and friends) in memory _and_ treat them as regular
//! `f32` slices for easy loading and manipulation; `simd_aligned` to the rescue.
//!
//! # Highlights
//!
//! * built on top of [`std::simd`](https://github.com/rust-lang-nursery/packed_simd/) for easy data handling
//! * supports everything from `u8x2` to `f64x8`
//! * think in flat slices (`&[f32]`), but get the performance of properly aligned SIMD vectors (`&[f32x16]`)
//! * defines `u8s`, ..., `f64s` as a "best guess" for the current platform (WIP)
//! * provides the N-dimensional [`VectorD`] and the NxM-dimensional [`MatrixD`]
//!
//! **Note**: Right now this is an experimental crate. Features might be added or removed depending on how
//! [`std::simd`](https://github.com/rust-lang-nursery/packed_simd/) evolves. At the end of the day it's just
//! about being able to load and manipulate data without much fuss.
//!
//! # Examples
//!
//! The following produces a vector that can hold `10` elements of type `f64`. Internally it might allocate
//! `5` elements of type `f64x2`, or `3` of type `f64x4`, depending on the platform. All elements are
//! guaranteed to be properly aligned for fast access.
//!
//! ```rust
//! use packed_simd::*;
//! use simd_aligned::*;
//!
//! // Create vectors of `10` f64 elements with value `0.0`.
//! let mut v1 = VectorD::<f64s>::with(0.0, 10);
//! let mut v2 = VectorD::<f64s>::with(0.0, 10);
//!
//! // Get a "flat", mutable view of each vector ...
//! let v1_m = v1.flat_mut();
//! let v2_m = v2.flat_mut();
//!
//! // ... set some individual elements on v1 ...
//! v1_m[0] = 0.0;
//! v1_m[4] = 4.0;
//! v1_m[8] = 8.0;
//!
//! // ... and some others on v2.
//! v2_m[1] = 0.0;
//! v2_m[5] = 5.0;
//! v2_m[9] = 9.0;
//!
//! let mut sum = f64s::splat(0.0);
//!
//! // Eventually, do something with the actual SIMD types. This performs
//! // `std::simd` vector math, e.g., f64x8 + f64x8, in one operation:
//! sum = v1[0] + v2[0];
//! ```
//!
//! # Benchmarks
//!
//! There is no performance penalty for using `simd_aligned`, while retaining all the
//! simplicity of handling flat arrays.
//!
//! ```ignore
//! test vectors::packed        ... bench:    77 ns/iter (+/- 4)
//! test vectors::scalar        ... bench: 1,177 ns/iter (+/- 464)
//! test vectors::simd_aligned  ... bench:    71 ns/iter (+/- 5)
//! ```
//!
//! # FAQ
//!
//! ### How does it relate to [faster](https://github.com/AdamNiederer/faster) and [`std::simd`](https://github.com/rust-lang-nursery/packed_simd/)?
//!
//! * `simd_aligned` builds on top of `std::simd`. It aims to provide common, SIMD-aligned
//! data structures that support simple and safe scalar access patterns.
//!
//! * `faster` (as of today) is really good if you already have existing flat slices in your code
//! and want to operate on them "full SIMD ahead". However, in particular when dealing with multiple
//! slices at the same time (e.g., kernel computations), the performance impact of unaligned arrays can
//! become noticeable (e.g., in the case of [ffsvm](https://github.com/ralfbiedert/ffsvm-rust/) up to 10% - 20%).

#![warn(clippy::all)] // Enable ALL the warnings ...
#![warn(clippy::nursery)]
#![warn(clippy::pedantic)]
#![warn(clippy::cargo)]
#![allow(clippy::module_name_repetitions)]
#![allow(clippy::module_inception)]

mod conversion;
mod matrix;
mod packed;
mod vector;

pub mod arch;
pub mod traits;

use packed_simd::*;

pub use crate::{
    arch::current::*,
    conversion::{packed_as_flat, packed_as_flat_mut},
    matrix::{AccessStrategy, Columns, MatrixD, MatrixFlat, MatrixFlatMut, Rows},
    vector::VectorD,
};

// Implements `crate::traits::Simd` for a concrete `packed_simd` vector type,
// recording its scalar element type, lane count and flat array representation.
macro_rules! impl_simd {
    ($simd:ty, $element:ty, $lanes:expr, $lanestype:ty) => {
        impl crate::traits::Simd for $simd {
            type Element = $element;
            type LanesType = $lanestype;

            const LANES: usize = $lanes;

            fn splat(t: Self::Element) -> Self {
                Self::splat(t)
            }
        }
    };
}

impl_simd!(u8x2, u8, 2, [u8; 2]);
impl_simd!(u8x4, u8, 4, [u8; 4]);
impl_simd!(u8x8, u8, 8, [u8; 8]);
impl_simd!(u8x16, u8, 16, [u8; 16]);
impl_simd!(u8x32, u8, 32, [u8; 32]);

impl_simd!(i8x2, i8, 2, [i8; 2]);
impl_simd!(i8x4, i8, 4, [i8; 4]);
impl_simd!(i8x8, i8, 8, [i8; 8]);
impl_simd!(i8x16, i8, 16, [i8; 16]);
impl_simd!(i8x32, i8, 32, [i8; 32]);

impl_simd!(u16x2, u16, 2, [u16; 2]);
impl_simd!(u16x4, u16, 4, [u16; 4]);
impl_simd!(u16x8, u16, 8, [u16; 8]);
impl_simd!(u16x16, u16, 16, [u16; 16]);

impl_simd!(i16x2, i16, 2, [i16; 2]);
impl_simd!(i16x4, i16, 4, [i16; 4]);
impl_simd!(i16x8, i16, 8, [i16; 8]);
impl_simd!(i16x16, i16, 16, [i16; 16]);

impl_simd!(u32x2, u32, 2, [u32; 2]);
impl_simd!(u32x4, u32, 4, [u32; 4]);
impl_simd!(u32x8, u32, 8, [u32; 8]);

impl_simd!(i32x2, i32, 2, [i32; 2]);
impl_simd!(i32x4, i32, 4, [i32; 4]);
impl_simd!(i32x8, i32, 8, [i32; 8]);

impl_simd!(u64x2, u64, 2, [u64; 2]);
impl_simd!(u64x4, u64, 4, [u64; 4]);

impl_simd!(i64x2, i64, 2, [i64; 2]);
impl_simd!(i64x4, i64, 4, [i64; 4]);

impl_simd!(f32x2, f32, 2, [f32; 2]);
impl_simd!(f32x4, f32, 4, [f32; 4]);
impl_simd!(f32x8, f32, 8, [f32; 8]);
impl_simd!(f32x16, f32, 16, [f32; 16]);

impl_simd!(f64x2, f64, 2, [f64; 2]);
impl_simd!(f64x4, f64, 4, [f64; 4]);
impl_simd!(f64x8, f64, 8, [f64; 8]);
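
// ---------------------------------------------------------------------------
// A minimal sketch (not part of the original source) showing how the `Simd`
// trait implemented by `impl_simd!` above can be used from generic code. The
// helper `splat_and_lanes` and the test module are illustrative assumptions;
// they only rely on the trait items visible in this file (`Element`, `LANES`,
// `splat`) and on `packed_simd`'s `extract` method.
// ---------------------------------------------------------------------------
#[cfg(test)]
mod simd_trait_sketch {
    use crate::traits::Simd;
    use packed_simd::{f32x4, f64x2};

    // Hypothetical generic helper: builds a vector with every lane set to
    // `value` and reports how many lanes the register holds.
    fn splat_and_lanes<S: Simd>(value: S::Element) -> (S, usize) {
        (S::splat(value), S::LANES)
    }

    #[test]
    fn splat_fills_all_lanes() {
        let (v, lanes) = splat_and_lanes::<f32x4>(1.0);
        assert_eq!(lanes, 4);
        assert_eq!(v.extract(0), 1.0);
        assert_eq!(v.extract(3), 1.0);

        let (w, lanes) = splat_and_lanes::<f64x2>(2.5);
        assert_eq!(lanes, 2);
        assert_eq!(w.extract(1), 2.5);
    }
}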