faster 0.5.2

Explicit SIMD for humans
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
// This file is part of faster, the SIMD library for humans.
// Copyright 2017 Adam Niederer <adam.niederer@gmail.com>

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

#![allow(dead_code)]

//! Vector types which aren't interpreted as SIMD vectors, for systems which
//! don't have SIMD support.
use crate::core::ops::*;
use crate::core::mem::*;
use crate::core::ptr::*;
use crate::core::fmt::*;
use crate::vecs::*;

// Generates an array-backed stand-in for a SIMD vector type.
//
// Arguments:
//   $el      - lane (element) type, e.g. `u32`
//   $pvec    - accepted for call-site compatibility; not used in the expansion
//   $vec     - name of the generated struct, e.g. `u32x4`
//   $sz      - number of lanes
//   [$elname] - one identifier per lane, used as the parameter names of `new`
macro_rules! impl_packed_type {
    ($el:ident, $pvec:ident, $vec:ident, $sz:expr, [$($elname:ident),+]) => {
        // Blocked by const generics (or impl {Debug, PartialEq} for [T; 64])
        #[derive(Clone, Copy, /*Debug, PartialEq*/)]
        #[allow(non_camel_case_types)]
        pub struct $vec { data: [$el; $sz] }

        // PartialEq shim until const generics arrive
        impl PartialEq<Self> for $vec {
            /// Two vectors are equal when every pair of lanes compares equal.
            #[inline(always)]
            fn eq(&self, other: &Self) -> bool {
                self.data.iter().zip(other.data.iter()).all(|(a, b)| a == b)
            }
        }

        // Debug shim until const generics arrive
        impl Debug for $vec {
            /// Formats as `name(lane0, lane1, ..., )`.
            #[inline(always)]
            fn fmt(&self, f: &mut Formatter) -> Result {
                // Metavariables are not expanded inside string literals, so
                // the type name has to be produced with `stringify!`.
                write!(f, "{}(", stringify!($vec))?;
                for n in self.data.iter() {
                    write!(f, "{:?}, ", n)?;
                }
                write!(f, ")")
            }
        }

        impl $vec {
            /// Builds a vector from one value per lane, in lane order.
            #[inline(always)]
            pub fn new($($elname: $el),*) -> $vec {
                $vec { data: [$($elname),*] }
            }

            /// Number of lanes in this vector type.
            #[inline(always)]
            pub fn len() -> i32 {
                $sz
            }

            /// Builds a vector with every lane set to `value`.
            #[inline(always)]
            pub fn splat(value: $el) -> $vec {
                $vec { data: [value; $sz] }
            }

            /// Returns lane `idx`.
            ///
            /// # Panics
            /// Panics if `idx` is not a valid lane index.
            #[inline(always)]
            pub fn extract(self, idx: usize) -> $el {
                self.data[idx]
            }

            /// Returns lane `idx`.
            ///
            /// # Safety
            /// This implementation still bounds-checks `idx`; the function is
            /// `unsafe` only to keep the same signature as stdsimd.
            #[inline(always)]
            pub unsafe fn extract_unchecked(self, idx: usize) -> $el {
                // Maintain unsafe API with stdsimd
                self.data[idx]
            }

            /// Returns a copy of the vector with lane `idx` set to `val`.
            ///
            /// # Panics
            /// Panics if `idx` is not a valid lane index.
            #[inline(always)]
            pub fn replace(mut self, idx: usize, val: $el) -> $vec {
                self.data[idx] = val;
                self
            }

            /// Returns a copy of the vector with lane `idx` set to `val`.
            ///
            /// # Safety
            /// This implementation still bounds-checks `idx`; the function is
            /// `unsafe` only to keep the same signature as stdsimd.
            #[inline(always)]
            pub unsafe fn replace_unchecked(mut self, idx: usize, val: $el) -> $vec {
                // Maintain unsafe API with stdsimd
                self.data[idx] = val;
                self
            }

            /// Writes all lanes into `slice`, starting at element `offset`.
            ///
            /// # Panics
            /// Panics unless `slice[offset..]` holds at least one full vector.
            #[inline(always)]
            pub fn store(self, slice: &mut [$el], offset: usize) {
                // Slicing twice validates both the offset and the remaining
                // length without any `offset + len` overflow.
                slice[offset..][..$sz].copy_from_slice(&self.data);
            }

            /// Writes all lanes into the start of `slice`.
            ///
            /// # Panics
            /// Panics if `slice` is shorter than one vector.
            #[inline(always)]
            pub fn store_unaligned(self, slice: &mut [$el]) {
                slice[..$sz].copy_from_slice(&self.data);
            }

            /// Writes all lanes into `slice`, starting at element `offset`,
            /// without checking that a full vector fits.
            ///
            /// # Safety
            /// `slice[offset..]` must hold at least one full vector. Only
            /// `offset <= slice.len()` is checked (by the slicing operation).
            #[inline(always)]
            pub unsafe fn store_unchecked(self, slice: &mut [$el], offset: usize) {
                copy_nonoverlapping(
                    &self as *const $vec as *const u8,
                    slice[offset..].as_mut_ptr() as *mut u8,
                    size_of::<$vec>());
            }

            /// Writes all lanes into the start of `slice` without any length check.
            ///
            /// # Safety
            /// `slice` must hold at least one full vector.
            // TODO: Actually check alignment
            #[inline(always)]
            pub unsafe fn store_aligned_unchecked(self, slice: &mut [$el]) {
                copy_nonoverlapping(
                    &self as *const $vec as *const u8,
                    slice.as_mut_ptr() as *mut u8,
                    size_of::<$vec>());
            }

            /// Writes all lanes into the start of `slice` without any length check.
            ///
            /// # Safety
            /// `slice` must hold at least one full vector.
            #[inline(always)]
            pub unsafe fn store_unaligned_unchecked(self, slice: &mut [$el]) {
                copy_nonoverlapping(
                    &self as *const $vec as *const u8,
                    slice.as_mut_ptr() as *mut u8,
                    size_of::<$vec>());
            }

            /// Reads one vector from `slice`, starting at element `offset`.
            ///
            /// # Panics
            /// Panics unless `slice[offset..]` holds at least one full vector.
            #[inline(always)]
            pub fn load(slice: &[$el], offset: usize) -> $vec {
                let mut data = [0 as $el; $sz];
                data.copy_from_slice(&slice[offset..][..$sz]);
                $vec { data: data }
            }

            /// Reads one vector from the start of `slice`.
            ///
            /// # Panics
            /// Panics if `slice` is shorter than one vector.
            #[inline(always)]
            pub fn load_unaligned(slice: &[$el]) -> $vec {
                let mut data = [0 as $el; $sz];
                data.copy_from_slice(&slice[..$sz]);
                $vec { data: data }
            }

            /// Reads one vector from `slice`, starting at element `offset`,
            /// without checking that a full vector is available.
            ///
            /// # Safety
            /// `slice[offset..]` must hold at least one full vector. Only
            /// `offset <= slice.len()` is checked (by the slicing operation).
            #[inline(always)]
            pub unsafe fn load_unchecked(slice: &[$el], offset: usize) -> $vec {
                let mut x = $vec::splat(0 as $el);
                copy_nonoverlapping(
                    slice[offset..].as_ptr() as *const u8,
                    &mut x as *mut $vec as *mut u8,
                    size_of::<$vec>());
                x
            }

            /// Reads one vector from the start of `slice` without any length check.
            ///
            /// # Safety
            /// `slice` must hold at least one full vector.
            // TODO: Actually check alignment
            #[inline(always)]
            pub unsafe fn load_aligned_unchecked(slice: &[$el]) -> $vec {
                let mut x = $vec::splat(0 as $el);
                copy_nonoverlapping(
                    slice.as_ptr() as *const u8,
                    &mut x as *mut $vec as *mut u8,
                    size_of::<$vec>());
                x
            }

            /// Reads one vector from the start of `slice` without any length check.
            ///
            /// # Safety
            /// `slice` must hold at least one full vector.
            #[inline(always)]
            pub unsafe fn load_unaligned_unchecked(slice: &[$el]) -> $vec {
                let mut x = $vec::splat(0 as $el);
                copy_nonoverlapping(
                    slice.as_ptr() as *const u8,
                    &mut x as *mut $vec as *mut u8,
                    size_of::<$vec>());
                x
            }
        }
    }
}

// Implements `From<$from> for $to` for every listed `$from` type by
// reinterpreting the source value's bits as the target type.
macro_rules! impl_from {
    ($to:ident, $($from:ident),+) => {
        $(
            impl From<$from> for $to {
                /// Bit-for-bit reinterpretation; lanes are not converted by value.
                #[inline(always)]
                fn from(source: $from) -> Self {
                    // `transmute` refuses to compile unless both types have
                    // the same size, so mismatched pairs are caught at build time.
                    unsafe { transmute::<$from, $to>(source) }
                }
            }
        )+
    }
}

// Implements binary operator traits for a vector type by applying the scalar
// operator to each pair of lanes.
//
// Each `[Trait, method, op]` triple yields one `impl Trait<Self> for $vec`.
macro_rules! impl_ops {
    ($el:ty, $vec:ty, $([$trait:tt, $fn:tt, $op:tt]),*) => {
        $(
            impl $trait <Self> for $vec {
                type Output = Self;
                #[inline(always)]
                fn $fn(self, rhs: Self) -> Self::Output {
                    let mut out = Self::splat(0 as $el);
                    let operands = self.data.iter().zip(rhs.data.iter());
                    // Walk the output lanes in lockstep with the operand pairs.
                    for (slot, (a, b)) in out.data.iter_mut().zip(operands) {
                        *slot = a $op b;
                    }
                    out
                }
            }
        )*
    }
}

// Implements compound-assignment operator traits for a vector type by
// applying the scalar operator lane by lane, in place.
//
// Each `[Trait, method, op]` triple yields one `impl Trait<Self> for $vec`.
macro_rules! impl_assignops {
    ($el:ty, $vec:ty, $([$trait:tt, $fn:tt, $op:tt]),*) => {
        $(
            impl $trait <Self> for $vec {
                #[inline(always)]
                fn $fn(&mut self, rhs: Self) {
                    self.data
                        .iter_mut()
                        .zip(rhs.data.iter())
                        .for_each(|(lane, operand)| { *lane $op operand; });
                }
            }
        )*
    }
}

// Adds a method `$name` to `$vec` that converts every lane to `$el` with an
// `as` cast and returns the result as a `$tovec`.
macro_rules! impl_cast {
    ($vec:ty, $tovec:tt, $el:ty, $name:ident) => {
        impl $vec {
            #[inline(always)]
            pub fn $name(self) -> $tovec {
                let mut converted = $tovec::splat(0 as $el);
                // Pair each destination lane with its source lane.
                for (dst, src) in converted.data.iter_mut().zip(self.data.iter()) {
                    *dst = *src as $el;
                }
                converted
            }
        }
    }
}

// "undefined" is just a string that should not match any target-feature.
// NOTE(review): `impl_packed!` is not defined in this file; presumably it is
// provided by another module of the crate — confirm where it lives.
// Each call passes an element type, two vector-type identifiers, two
// integers, an empty list and the placeholder feature list. The integers look
// like element size in bytes followed by lane count (e.g. u16: 2, 8) — TODO
// confirm against the macro definition.
impl_packed!(u8, u8s, u8x16, 1, 16, [], ["undefined"]);
impl_packed!(i8, i8s, i8x16, 1, 16, [], ["undefined"]);
impl_packed!(u16, u16s, u16x8, 2, 8, [], ["undefined"]);
impl_packed!(i16, i16s, i16x8, 2, 8, [], ["undefined"]);
impl_packed!(u32, u32s, u32x4, 4, 4, [], ["undefined"]);
impl_packed!(i32, i32s, i32x4, 4, 4, [], ["undefined"]);
impl_packed!(f32, f32s, f32x4, 4, 4, [], ["undefined"]);
impl_packed!(u64, u64s, u64x2, 8, 2, [], ["undefined"]);
impl_packed!(i64, i64s, i64x2, 8, 2, [], ["undefined"]);
impl_packed!(f64, f64s, f64x2, 8, 2, [], ["undefined"]);

// Generate the array-backed vector structs. Every element type gets three
// variants whose lane counts make the backing array 16, 32 and 64 bytes long
// (e.g. f64x2 / f64x4 / f64x8). The bracketed identifiers become the
// parameter names of the generated `new`, one per lane. The second argument
// (`f64s`, `u64s`, ...) is accepted by `impl_packed_type!` but is not used in
// its expansion.
impl_packed_type!(f64, f64s, f64x2, 2, [x0, x1]);
impl_packed_type!(f64, f64s, f64x4, 4, [x0, x1, x2, x3]);
impl_packed_type!(f64, f64s, f64x8, 8, [x0, x1, x2, x3, x4, x5, x6, x7]);
impl_packed_type!(u64, u64s, u64x2, 2, [x0, x1]);
impl_packed_type!(u64, u64s, u64x4, 4, [x0, x1, x2, x3]);
impl_packed_type!(u64, u64s, u64x8, 8, [x0, x1, x2, x3, x4, x5, x6, x7]);
impl_packed_type!(i64, i64s, i64x2, 2, [x0, x1]);
impl_packed_type!(i64, i64s, i64x4, 4, [x0, x1, x2, x3]);
impl_packed_type!(i64, i64s, i64x8, 8, [x0, x1, x2, x3, x4, x5, x6, x7]);
impl_packed_type!(f32, f32s, f32x4, 4, [x0, x1, x2, x3]);
impl_packed_type!(f32, f32s, f32x8, 8, [x0, x1, x2, x3, x4, x5, x6, x7]);
impl_packed_type!(f32, f32s, f32x16, 16, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15]);
impl_packed_type!(u32, u32s, u32x4, 4, [x0, x1, x2, x3]);
impl_packed_type!(u32, u32s, u32x8, 8, [x0, x1, x2, x3, x4, x5, x6, x7]);
impl_packed_type!(u32, u32s, u32x16, 16, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15]);
impl_packed_type!(i32, i32s, i32x4, 4, [x0, x1, x2, x3]);
impl_packed_type!(i32, i32s, i32x8, 8, [x0, x1, x2, x3, x4, x5, x6, x7]);
impl_packed_type!(i32, i32s, i32x16, 16, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15]);
impl_packed_type!(u16, u16s, u16x8, 8, [x0, x1, x2, x3, x4, x5, x6, x7]);
impl_packed_type!(u16, u16s, u16x16, 16, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15]);
impl_packed_type!(u16, u16s, u16x32, 32, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31]);
impl_packed_type!(i16, i16s, i16x8, 8, [x0, x1, x2, x3, x4, x5, x6, x7]);
impl_packed_type!(i16, i16s, i16x16, 16, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15]);
impl_packed_type!(i16, i16s, i16x32, 32, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31]);
impl_packed_type!(u8, u8s, u8x16, 16, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15]);
impl_packed_type!(u8, u8s, u8x32, 32, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31]);
impl_packed_type!(u8, u8s, u8x64, 64, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, x32, x33, x34, x35, x36, x37, x38, x39, x40, x41, x42, x43, x44, x45, x46, x47, x48, x49, x50, x51, x52, x53, x54, x55, x56, x57, x58, x59, x60, x61, x62, x63]);
impl_packed_type!(i8, i8s, i8x16, 16, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15]);
impl_packed_type!(i8, i8s, i8x32, 32, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31]);
impl_packed_type!(i8, i8s, i8x64, 64, [x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, x32, x33, x34, x35, x36, x37, x38, x39, x40, x41, x42, x43, x44, x45, x46, x47, x48, x49, x50, x51, x52, x53, x54, x55, x56, x57, x58, x59, x60, x61, x62, x63]);

// `From` conversions between integer vector types of equal total size. The
// first argument is the target type; the rest are the source types.
// `impl_from!` expands to a `transmute`, so these reinterpret bits rather than
// convert lane values. No float vector types appear in these lists.

// 16-byte integer vectors.
impl_from!(u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, u8x16, i8x16);
impl_from!(i64x2, u64x2, u32x4, i32x4, u16x8, i16x8, u8x16, i8x16);
impl_from!(u32x4, u64x2, i64x2, i32x4, u16x8, i16x8, u8x16, i8x16);
impl_from!(i32x4, u64x2, i64x2, u32x4, u16x8, i16x8, u8x16, i8x16);
impl_from!(u16x8, u64x2, i64x2, u32x4, i32x4, i16x8, u8x16, i8x16);
impl_from!(i16x8, u64x2, i64x2, u32x4, i32x4, u16x8, u8x16, i8x16);
impl_from!(u8x16, u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, i8x16);
impl_from!(i8x16, u64x2, i64x2, u32x4, i32x4, u16x8, i16x8, u8x16);

// 32-byte integer vectors.
impl_from!(u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32);
impl_from!(i64x4, u64x4, u32x8, i32x8, u16x16, i16x16, u8x32, i8x32);
impl_from!(u32x8, u64x4, i64x4, i32x8, u16x16, i16x16, u8x32, i8x32);
impl_from!(i32x8, u64x4, i64x4, u32x8, u16x16, i16x16, u8x32, i8x32);
impl_from!(u16x16, u64x4, i64x4, u32x8, i32x8, i16x16, u8x32, i8x32);
impl_from!(i16x16, u64x4, i64x4, u32x8, i32x8, u16x16, u8x32, i8x32);
impl_from!(u8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, i8x32);
impl_from!(i8x32, u64x4, i64x4, u32x8, i32x8, u16x16, i16x16, u8x32);

// 64-byte integer vectors.
impl_from!(u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64, i8x64);
impl_from!(i64x8, u64x8, u32x16, i32x16, u16x32, i16x32, u8x64, i8x64);
impl_from!(u32x16, u64x8, i64x8, i32x16, u16x32, i16x32, u8x64, i8x64);
impl_from!(i32x16, u64x8, i64x8, u32x16, u16x32, i16x32, u8x64, i8x64);
impl_from!(u16x32, u64x8, i64x8, u32x16, i32x16, i16x32, u8x64, i8x64);
impl_from!(i16x32, u64x8, i64x8, u32x16, i32x16, u16x32, u8x64, i8x64);
impl_from!(u8x64, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, i8x64);
impl_from!(i8x64, u64x8, i64x8, u32x16, i32x16, u16x32, i16x32, u8x64);

// Lane-wise binary operators for every vector type. Integer vectors get
// arithmetic, shift, remainder and bitwise operators; float vectors get only
// Mul, Div, Add and Sub.

// 16-byte vectors.
impl_ops!(i8, i8x16, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(u8, u8x16, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(i16, i16x8, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(u16, u16x8, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(i32, i32x4, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(u32, u32x4, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(f32, f32x4, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -]);
impl_ops!(i64, i64x2, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(u64, u64x2, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(f64, f64x2, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -]);

// 32-byte vectors.
impl_ops!(i8, i8x32, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(u8, u8x32, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(i16, i16x16, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(u16, u16x16, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(i32, i32x8, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(u32, u32x8, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(f32, f32x8, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -]);
impl_ops!(i64, i64x4, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(u64, u64x4, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(f64, f64x4, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -]);

// 64-byte vectors.
impl_ops!(i8, i8x64, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(u8, u8x64, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(i16, i16x32, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(u16, u16x32, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(i32, i32x16, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(u32, u32x16, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(f32, f32x16, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -]);
impl_ops!(i64, i64x8, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(u64, u64x8, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -],
          [Shl, shl, <<], [Shr, shr, >>], [Rem, rem, %], [BitAnd, bitand, &],
          [BitOr, bitor, |], [BitXor, bitxor, ^]);
impl_ops!(f64, f64x8, [Mul, mul, *], [Div, div, /], [Add, add, +], [Sub, sub, -]);

// Lane-wise compound-assignment operators for every vector type. Integer
// vectors get the arithmetic, shift, remainder and bitwise forms; float
// vectors get only MulAssign, DivAssign, AddAssign and SubAssign.

// 16-byte vectors.
impl_assignops!(i8, i8x16, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(u8, u8x16, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(i16, i16x8, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(u16, u16x8, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(i32, i32x4, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(u32, u32x4, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(f32, f32x4, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=]);
impl_assignops!(i64, i64x2, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(u64, u64x2, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(f64, f64x2, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=]);

// 32-byte vectors.
impl_assignops!(i8, i8x32, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(u8, u8x32, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(i16, i16x16, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(u16, u16x16, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(i32, i32x8, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(u32, u32x8, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(f32, f32x8, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=]);
impl_assignops!(i64, i64x4, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(u64, u64x4, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(f64, f64x4, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=]);

// 64-byte vectors.
impl_assignops!(i8, i8x64, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(u8, u8x64, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(i16, i16x32, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(u16, u16x32, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(i32, i32x16, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(u32, u32x16, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(f32, f32x16, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=]);
impl_assignops!(i64, i64x8, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(u64, u64x8, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=],
                [ShlAssign, shl_assign, <<=], [ShrAssign, shr_assign, >>=], [RemAssign, rem_assign, %=], [BitAndAssign, bitand_assign, &=],
                [BitOrAssign, bitor_assign, |=], [BitXorAssign, bitxor_assign, ^=]);
impl_assignops!(f64, f64x8, [MulAssign, mul_assign, *=], [DivAssign, div_assign, /=], [AddAssign, add_assign, +=], [SubAssign, sub_assign, -=]);

// Lane-wise value casts. Each call adds one `as_<target>` method to the first
// type; `impl_cast!` converts every lane with an `as` cast, in contrast to the
// `From` impls in this file, which `transmute`. 8- and 16-bit vectors get
// signed <-> unsigned casts only; 32- and 64-bit vectors additionally get
// casts to and from the float vector with the same lane count.

// 8-bit lanes.
impl_cast!(i8x16, u8x16, u8, as_u8x16);
impl_cast!(u8x16, i8x16, i8, as_i8x16);

impl_cast!(i8x32, u8x32, u8, as_u8x32);
impl_cast!(u8x32, i8x32, i8, as_i8x32);

impl_cast!(i8x64, u8x64, u8, as_u8x64);
impl_cast!(u8x64, i8x64, i8, as_i8x64);

// 16-bit lanes.
impl_cast!(i16x8, u16x8, u16, as_u16x8);
impl_cast!(u16x8, i16x8, i16, as_i16x8);

impl_cast!(i16x16, u16x16, u16, as_u16x16);
impl_cast!(u16x16, i16x16, i16, as_i16x16);

impl_cast!(i16x32, u16x32, u16, as_u16x32);
impl_cast!(u16x32, i16x32, i16, as_i16x32);

// 32-bit lanes (integer and f32).
impl_cast!(i32x4, u32x4, u32, as_u32x4);
impl_cast!(f32x4, u32x4, u32, as_u32x4);
impl_cast!(f32x4, i32x4, i32, as_i32x4);
impl_cast!(u32x4, i32x4, i32, as_i32x4);
impl_cast!(u32x4, f32x4, f32, as_f32x4);
impl_cast!(i32x4, f32x4, f32, as_f32x4);

impl_cast!(i32x8, u32x8, u32, as_u32x8);
impl_cast!(f32x8, u32x8, u32, as_u32x8);
impl_cast!(f32x8, i32x8, i32, as_i32x8);
impl_cast!(u32x8, i32x8, i32, as_i32x8);
impl_cast!(u32x8, f32x8, f32, as_f32x8);
impl_cast!(i32x8, f32x8, f32, as_f32x8);

impl_cast!(i32x16, u32x16, u32, as_u32x16);
impl_cast!(f32x16, u32x16, u32, as_u32x16);
impl_cast!(f32x16, i32x16, i32, as_i32x16);
impl_cast!(u32x16, i32x16, i32, as_i32x16);
impl_cast!(u32x16, f32x16, f32, as_f32x16);
impl_cast!(i32x16, f32x16, f32, as_f32x16);

// 64-bit lanes (integer and f64).
impl_cast!(i64x2, u64x2, u64, as_u64x2);
impl_cast!(f64x2, u64x2, u64, as_u64x2);
impl_cast!(f64x2, i64x2, i64, as_i64x2);
impl_cast!(u64x2, i64x2, i64, as_i64x2);
impl_cast!(u64x2, f64x2, f64, as_f64x2);
impl_cast!(i64x2, f64x2, f64, as_f64x2);

impl_cast!(i64x4, u64x4, u64, as_u64x4);
impl_cast!(f64x4, u64x4, u64, as_u64x4);
impl_cast!(f64x4, i64x4, i64, as_i64x4);
impl_cast!(u64x4, i64x4, i64, as_i64x4);
impl_cast!(u64x4, f64x4, f64, as_f64x4);
impl_cast!(i64x4, f64x4, f64, as_f64x4);

impl_cast!(i64x8, u64x8, u64, as_u64x8);
impl_cast!(f64x8, u64x8, u64, as_u64x8);
impl_cast!(f64x8, i64x8, i64, as_i64x8);
impl_cast!(u64x8, i64x8, i64, as_i64x8);
impl_cast!(u64x8, f64x8, f64, as_f64x8);
impl_cast!(i64x8, f64x8, f64, as_f64x8);