1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
//! Endian-aware u16/u32 SIMD loaders for AArch64 NEON.
// The tier kernels (Phase 2 rollout PRs) will consume these helpers.
//!
//! Each helper takes a raw byte pointer to LE-encoded (or BE-encoded) data
//! and returns a NEON vector containing the elements in **host-native** byte
//! order, ready for native u16/u32 SIMD math.
//!
//! The host-native conversion is monomorphized at compile time via
//! `cfg(target_endian = ...)`:
//! - `load_le_*`: the conversion is a no-op on LE targets and a byte-swap on BE targets
//! - `load_be_*`: the conversion is a byte-swap on LE targets and a no-op on BE targets
//!
//! Tier kernels call the generic dispatchers `load_endian_u16x8::<BE>` and
//! `load_endian_u32x4::<BE>` from their own `<const BE: bool>` contexts.
//! The `if BE { ... } else { ... }` in the dispatcher is eliminated by the
//! compiler — each monomorphization sees only one branch.
use *;
// ---- u16x4 loaders ---------------------------------------------------------
/// Loads 4 x u16 from `ptr` (LE-encoded on disk/wire) into host-native order.
///
/// # Safety
///
/// `ptr` must be valid for reads of at least 8 bytes. No alignment is required
/// beyond 1 byte (unaligned pointers are fine). Caller must have NEON enabled.
pub unsafe
/// Loads 4 x u16 from `ptr` (BE-encoded on disk/wire) into host-native order.
///
/// # Safety
///
/// `ptr` must be valid for reads of at least 8 bytes. No alignment is required
/// beyond 1 byte (unaligned pointers are fine). Caller must have NEON enabled.
pub unsafe
/// Generic dispatcher: routes to `load_le_u16x4` or `load_be_u16x4` based on
/// the compile-time `BE` const parameter.
///
/// # Safety
///
/// Same as `load_le_u16x4` / `load_be_u16x4`.
pub unsafe
// ---- u16x8 loaders ---------------------------------------------------------
/// Loads 8 x u16 from `ptr` (LE-encoded on disk/wire) into host-native order.
///
/// # Safety
///
/// `ptr` must be valid for reads of at least 16 bytes. No alignment is required
/// beyond 1 byte (unaligned pointers are fine).
/// Caller must have NEON enabled (via `#[target_feature(enable = "neon")]`).
pub unsafe
/// Loads 8 x u16 from `ptr` (BE-encoded on disk/wire) into host-native order.
///
/// # Safety
///
/// `ptr` must be valid for reads of at least 16 bytes. No alignment is required
/// beyond 1 byte (unaligned pointers are fine).
/// Caller must have NEON enabled (via `#[target_feature(enable = "neon")]`).
pub unsafe
/// Generic dispatcher: routes to `load_le_u16x8` or `load_be_u16x8` based on
/// the compile-time `BE` const parameter. The unused branch is eliminated by
/// the compiler when the caller is monomorphized.
///
/// # Safety
///
/// Same as `load_le_u16x8` / `load_be_u16x8`.
pub unsafe
// ---- u32x4 loaders ---------------------------------------------------------
/// Loads 4 x u32 from `ptr` (LE-encoded on disk/wire) into host-native order.
///
/// # Safety
///
/// `ptr` must be valid for reads of at least 16 bytes. No alignment is required
/// beyond 1 byte (unaligned pointers are fine). Caller must have NEON enabled.
pub unsafe
/// Loads 4 x u32 from `ptr` (BE-encoded on disk/wire) into host-native order.
///
/// # Safety
///
/// `ptr` must be valid for reads of at least 16 bytes. No alignment is required
/// beyond 1 byte (unaligned pointers are fine). Caller must have NEON enabled.
pub unsafe
/// Generic dispatcher: routes to `load_le_u32x4` or `load_be_u32x4` based on
/// the compile-time `BE` const parameter.
///
/// # Safety
///
/// Same as `load_le_u32x4` / `load_be_u32x4`.
pub unsafe