1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
//! Alphabet-based bitmask pre-filtering for ultra-fast chunk skipping.
//!
//! This provides a "Layer 0" screen that can discard non-matching chunks
//! in O(N) with very low constant factors using bit-parallelism.
#![deny(unsafe_op_in_unsafe_fn)]
/// A 256-bit mask representing the presence of all ASCII characters.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct AlphabetMask {
mask: [u64; 4],
}
impl AlphabetMask {
/// Create a mask from a slice of bytes.
pub fn from_bytes(bytes: &[u8]) -> Self {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
// SAFETY: We just checked for AVX2 support.
return unsafe { Self::from_bytes_avx2(bytes) };
}
if is_x86_feature_detected!("sse2") {
// SAFETY: SSE2 is a baseline for x86_64 but we gate it for clarity.
return unsafe { Self::from_bytes_sse2(bytes) };
}
}
#[cfg(target_arch = "aarch64")]
{
// SAFETY: ARM NEON is always available on aarch64.
return unsafe { Self::from_bytes_neon(bytes) };
}
Self::from_bytes_scalar(bytes)
}
pub fn from_bytes_scalar(bytes: &[u8]) -> Self {
let mut mask = [0u64; 4];
for &b in bytes {
mask[(b / 64) as usize] |= 1 << (b % 64);
}
Self { mask }
}
/// Build an [`AlphabetMask`] from `bytes` using the NEON-friendly
/// 16-byte-chunked loop. Public so the prefilter-robustness
/// proptest can compare SIMD output to the scalar fallback.
///
/// # Safety
/// Caller must run on an aarch64 target with NEON available. The
/// `#[cfg(target_arch = "aarch64")]` gate guarantees the first;
/// NEON is baseline on every Rust-supported aarch64 target so the
/// second is trivially true. The body is otherwise safe Rust.
#[cfg(target_arch = "aarch64")]
pub unsafe fn from_bytes_neon(bytes: &[u8]) -> Self {
let mut mask = [0u64; 4];
let chunks = bytes.chunks_exact(16);
let remainder = chunks.remainder();
for chunk in chunks {
for &b in chunk {
mask[(b / 64) as usize] |= 1 << (b % 64);
}
}
for &b in remainder {
mask[(b / 64) as usize] |= 1 << (b % 64);
}
Self { mask }
}
/// Build an [`AlphabetMask`] from `bytes` using the 4-byte unrolled
/// AVX2 body. Public so the prefilter-robustness proptest can
/// compare SIMD output to the scalar fallback.
///
/// # Safety
/// Caller must run on an x86_64 CPU that supports AVX2. The
/// `#[target_feature(enable = "avx2")]` attribute makes this a
/// caller obligation; invoking on a non-AVX2 host is UB.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
pub unsafe fn from_bytes_avx2(bytes: &[u8]) -> Self {
let mut mask = [0u64; 4];
let chunks = bytes.chunks_exact(4);
let remainder = chunks.remainder();
for chunk in chunks {
mask[(chunk[0] / 64) as usize] |= 1 << (chunk[0] % 64);
mask[(chunk[1] / 64) as usize] |= 1 << (chunk[1] % 64);
mask[(chunk[2] / 64) as usize] |= 1 << (chunk[2] % 64);
mask[(chunk[3] / 64) as usize] |= 1 << (chunk[3] % 64);
}
for &b in remainder {
mask[(b / 64) as usize] |= 1 << (b % 64);
}
Self { mask }
}
/// Build an [`AlphabetMask`] from `bytes` using the SSE2 baseline.
/// Public so the prefilter-robustness proptest can compare SIMD
/// output to the scalar fallback.
///
/// # Safety
/// Caller must run on an x86_64 CPU. SSE2 is mandatory on x86_64
/// per the SysV ABI, so the safety requirement is trivially met on
/// any host the `#[cfg]` permits; the `#[target_feature]` attribute
/// formalizes the caller obligation.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
pub unsafe fn from_bytes_sse2(bytes: &[u8]) -> Self {
let mut mask = [0u64; 4];
for &b in bytes {
mask[(b / 64) as usize] |= 1 << (b % 64);
}
Self { mask }
}
/// Create a mask from a string.
pub fn from_text(s: &str) -> Self {
Self::from_bytes(s.as_bytes())
}
/// Check if two masks have any common bits set.
pub fn intersects(&self, other: &Self) -> bool {
(self.mask[0] & other.mask[0]) != 0
|| (self.mask[1] & other.mask[1]) != 0
|| (self.mask[2] & other.mask[2]) != 0
|| (self.mask[3] & other.mask[3]) != 0
}
/// Union two masks together.
pub fn union(&mut self, other: &Self) {
self.mask[0] |= other.mask[0];
self.mask[1] |= other.mask[1];
self.mask[2] |= other.mask[2];
self.mask[3] |= other.mask[3];
}
}
/// A pre-filter that uses an [`AlphabetMask`] to quickly skip chunks.
#[derive(Clone, Debug, Default)]
pub struct AlphabetScreen {
pub target_mask: AlphabetMask,
}
impl AlphabetScreen {
/// Create a new screen from a set of target strings (literals or keywords).
pub fn new(targets: &[String]) -> Self {
let mut target_mask = AlphabetMask::default();
for target in targets {
target_mask.union(&AlphabetMask::from_text(target));
// Ensure case-insensitivity for the pre-screen
target_mask.union(&AlphabetMask::from_text(&target.to_lowercase()));
target_mask.union(&AlphabetMask::from_text(&target.to_uppercase()));
}
Self { target_mask }
}
/// Quick screen of a data chunk.
pub fn screen(&self, data: &[u8]) -> bool {
if data.is_empty() {
return false;
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
// SAFETY: We just checked for AVX2 support.
return unsafe { self.screen_avx2(data) };
}
}
// Fallback to building the mask and intersecting.
// This is actually faster than a simple scalar search for 1MB no-match.
self.target_mask.intersects(&AlphabetMask::from_bytes(data))
}
/// AVX2 implementation of [`screen`](Self::screen). Public so the
/// prefilter-robustness proptest can compare SIMD output to the
/// scalar fallback.
///
/// # Safety
/// Caller must run on an x86_64 CPU that supports AVX2. The
/// `#[target_feature(enable = "avx2")]` attribute makes this a
/// caller obligation; invoking on a non-AVX2 host is UB.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
pub unsafe fn screen_avx2(&self, data: &[u8]) -> bool {
use std::arch::x86_64::*;
// SAFETY: `target_mask.mask` is `[u64; 4]` (32 bytes total). Slicing
// `[..2]` gives 16 bytes / `[2..]` gives 16 bytes - exactly what
// `_mm_loadu_si128` needs. Loadu permits unaligned pointers.
// AVX2 availability is enforced by the surrounding
// `#[target_feature(enable = "avx2")]`. kimi-wave1 finding 6.LOW.alphabet_filter.rs.162.
let (bitset_low, bitset_high, bit_selector) = unsafe {
let low_mask = _mm_loadu_si128(self.target_mask.mask[..2].as_ptr() as *const __m128i);
let high_mask = _mm_loadu_si128(self.target_mask.mask[2..].as_ptr() as *const __m128i);
(
_mm256_set_m128i(low_mask, low_mask),
_mm256_set_m128i(high_mask, high_mask),
_mm256_setr_epi8(
1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32,
64, -128, 1, 2, 4, 8, 16, 32, 64, -128,
),
)
};
let chunks = data.chunks_exact(32);
let remainder = chunks.remainder();
for chunk in chunks {
// SAFETY: `chunks_exact(32)` guarantees chunk.len() == 32, so
// `chunk.as_ptr()` is valid for a 32-byte AVX2 unaligned load.
// Subsequent intrinsics are pure register ops; AVX2 availability
// is enforced by the surrounding `#[target_feature(enable =
// "avx2")]`. kimi-wave1 finding 6.LOW.alphabet_filter.rs.180.
unsafe {
let v = _mm256_loadu_si256(chunk.as_ptr() as *const __m256i);
// bit_index = v & 7
let bit_indices = _mm256_and_si256(v, _mm256_set1_epi8(0x07));
let bits = _mm256_shuffle_epi8(bit_selector, bit_indices);
// byte_index = (v >> 3) & 0x0F
let byte_indices =
_mm256_and_si256(_mm256_srli_epi16(v, 3), _mm256_set1_epi8(0x0F));
let is_128_255 = _mm256_cmpgt_epi8(_mm256_setzero_si256(), v); // Bit 7 set
let row_low = _mm256_shuffle_epi8(bitset_low, byte_indices);
let row_high = _mm256_shuffle_epi8(bitset_high, byte_indices);
let row = _mm256_blendv_epi8(row_low, row_high, is_128_255);
if _mm256_testz_si256(row, bits) == 0 {
return true;
}
}
}
for &b in remainder {
if (self.target_mask.mask[(b / 64) as usize] & (1 << (b % 64))) != 0 {
return true;
}
}
false
}
}