1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
// https://pastebin.com/uRuPq2VE (assembly)
//
// C version using intrinsics:
//
// void simd_mull_reduce_poly8x8(poly8x8_t *result,
// poly8x8_t *a, poly8x8_t *b) {
//
// // do non-modular poly multiply
// poly16x8_t working = vmull_p8(*a,*b);
//
// // copy result, and shift right
// uint16x8_t top_nibble = vshrq_n_u16 ((uint16x8_t) working, 12);
//
// // was uint8x16_t, but vtbl
// static uint8x8x2_t u4_0x11b_mod_table = {
// 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41,
// 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99,
// };
//
//
// uint8x8_t reduced = vmovn_u16(top_nibble);
//
// // now we should have what we need to do 8x8 table lookups
// uint8x8_t lut = vtbl2_u8(u4_0x11b_mod_table, reduced);
//
// // Next, have to convert u8 to u16, shifting left 4 bits
// poly16x8_t widened = (poly16x8_t) vmovl_u8(lut);
//
// // uint16x8_t vshlq_n_u16 (uint16x8_t, const int)
// // Form of expected instruction(s): vshl.i16 q0, q0, #0
// widened = (poly16x8_t) vshlq_n_u16((uint16x8_t) widened, 4);
//
// // uint16x8_t veorq_u16 (uint16x8_t, uint16x8_t)
// // Form of expected instruction(s): veor q0, q0, q0
// working = (poly16x8_t) veorq_u16((uint16x8_t) working, (uint16x8_t) widened);
//
// // First LUT complete... repeat steps
//
// // extra step to clear top nibble
// top_nibble = vshlq_n_u16 ((uint16x8_t) working, 4);
// // to get at the one to its right
// top_nibble = vshrq_n_u16 ((uint16x8_t) top_nibble, 12);
// reduced = vmovn_u16(top_nibble);
// lut = vtbl2_u8(u4_0x11b_mod_table, reduced);
// widened = (poly16x8_t) vmovl_u8(lut);
// // remove step, since we're applying to low byte
// // widened = (poly16x8_t) vshlq_n_u16((uint16x8_t) widened, 4);
// working = (poly16x8_t) veorq_u16((uint16x8_t) working, (uint16x8_t) widened);
//
// // apply mask (vand expects 2 registers, so use shl, shr combo)
// // working = (poly16x8_t) vshlq_n_u16 ((uint16x8_t) working, 8);
// // working = (poly16x8_t) vshrq_n_u16 ((uint16x8_t) working, 8);
//
// // use narrowing mov to send back result
// *result = (poly8x8_t) vmovn_u16((uint16x8_t) working);
// }
// Glob-import everything from the crate root (Rust 2015-edition path syntax).
// NOTE(review): `use *;` is only accepted in the 2015 edition — under the
// 2018+ editions this must become an explicit `use crate::*;` (or a named
// path). Confirm the crate's edition before migrating.
use *;
// looking at https://doc.rust-lang.org/core/arch/arm/
//
// all sorts of intrinsics are missing... not just vmull
//
// * vmull: no
// * vmovn_u16: ok
// * vmovl_u8: ok
// * vtbl2_u8: no (no vtbl instructions at all)
// * veorq_u16: ok
// * vshrq_n_u16: no (only vshrq_n_u8)
// * vshlq_n_u16: no (only vshlq_n_u8)
//
// Wait! https://docs.rs/core_arch/0.1.5/core_arch/aarch64/index.html
//
// These are listed under aarch64, even though I can use those
// intrinsics on a 32-bit neon machine.