fastlanes 0.5.1

Rust implementation of the FastLanes compression layout
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
//! Fast implementations of the `FastLanes` 1024-bit transpose.
//!
//! The `FastLanes` transpose is a fixed permutation of 1024 bits (16 × `u64`,
//! i.e. 128 bytes) that enables SIMD parallelism for encodings like delta and
//! RLE, and for transposing validity bitmaps. This module provides architecture
//! specific implementations of both the transpose and its inverse.
//!
//! The key insight is that each output byte is formed by extracting the SAME bit
//! position from 8 different input bytes at stride 16. The input byte groups follow
//! the [`crate::FL_ORDER`] permutation pattern.
//!
//! # Choosing an implementation
//!
//! [`transpose_bits`] / [`untranspose_bits`] dispatch to the fastest available
//! implementation. When the crate is built with the `std` feature this dispatch
//! is performed at runtime via CPU feature detection; otherwise it is resolved at
//! compile time from the enabled `target_feature`s, falling back to the portable
//! scalar implementation.
//!
//! Every implementation is also exposed directly so that a downstream crate can
//! select one explicitly even in a `no_std` build: [`scalar`], [`x86`]
//! (`bmi2` / `avx512vbmi`), and [`aarch64`] (`neon`).
//!
//! All entry points operate over `[u64; 16]` (one 1024-bit block); bytes are
//! interpreted in little-endian order.

pub mod scalar;

#[cfg(target_arch = "x86_64")]
pub mod x86;

#[cfg(target_arch = "aarch64")]
pub mod aarch64;

/// Base indices for the first 64 output bytes (lanes 0-7).
/// Each entry indicates the starting input byte index for that output byte group.
/// Pattern: `[0*2, 4*2, 2*2, 6*2, 1*2, 5*2, 3*2, 7*2]` = `[0, 8, 4, 12, 2, 10, 6, 14]`
pub(crate) const BASE_PATTERN_FIRST: [usize; 8] = [0, 8, 4, 12, 2, 10, 6, 14];

/// Base indices for the second 64 output bytes (lanes 8-15).
/// Pattern: first pattern + 1 = `[1, 9, 5, 13, 3, 11, 7, 15]`
pub(crate) const BASE_PATTERN_SECOND: [usize; 8] = [1, 9, 5, 13, 3, 11, 7, 15];

/// Masks for transposing 8x8 bit blocks.
pub(crate) const TRANSPOSE_2X2: u64 = 0x00AA_00AA_00AA_00AA;
pub(crate) const TRANSPOSE_4X4: u64 = 0x0000_CCCC_0000_CCCC;
pub(crate) const TRANSPOSE_8X8: u64 = 0x0000_0000_F0F0_F0F0;

/// Group-major gather/scatter permutation tables shared by the SIMD width-generic untranspose
/// kernels (NEON and AVX-512 VBMI). Only built for the architectures that have such a kernel.
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
pub(crate) mod group_perm {
    /// Byte-gather indices that pull the 8 bytes of every group into contiguous group-major order
    /// for an element width of `tb` bits. Group `g = lhi * (tb/8) + hi` collects byte `llo` of its
    /// lane words from input byte `lhi*tb + hi + llo*(tb/8)`. See
    /// [`crate::bit_transpose::scalar::untranspose_bits`].
    const fn gather_indices(tb: usize) -> [u8; 128] {
        let bytes = tb / 8;
        let mut idx = [0u8; 128];
        let mut g = 0;
        while g < 16 {
            let lhi = g / bytes;
            let hi = g % bytes;
            let gather_base = lhi * tb + hi;
            let mut llo = 0;
            while llo < 8 {
                idx[g * 8 + llo] = (gather_base + llo * bytes) as u8;
                llo += 1;
            }
            g += 1;
        }
        idx
    }

    /// Byte-scatter indices applied after the per-group 8x8 transpose: transposed byte `g*8 + lo`
    /// lands at logical byte `FL_ORDER[hi]*2 + lhi + lo*16`. Expressed as a gather table:
    /// `idx[logical_byte] = g*8 + lo`.
    const fn scatter_indices(tb: usize) -> [u8; 128] {
        let bytes = tb / 8;
        let mut idx = [0u8; 128];
        let mut g = 0;
        while g < 16 {
            let lhi = g / bytes;
            let hi = g % bytes;
            let scatter_base = crate::FL_ORDER[hi] * 2 + lhi;
            let mut lo = 0;
            while lo < 8 {
                idx[scatter_base + lo * 16] = (g * 8 + lo) as u8;
                lo += 1;
            }
            g += 1;
        }
        idx
    }

    static GATHER_8: [u8; 128] = gather_indices(8);
    static GATHER_16: [u8; 128] = gather_indices(16);
    static GATHER_32: [u8; 128] = gather_indices(32);
    static GATHER_64: [u8; 128] = gather_indices(64);

    static SCATTER_8: [u8; 128] = scatter_indices(8);
    static SCATTER_16: [u8; 128] = scatter_indices(16);
    static SCATTER_32: [u8; 128] = scatter_indices(32);
    static SCATTER_64: [u8; 128] = scatter_indices(64);

    /// Select the `(gather, scatter)` group-major permutation tables for an element width.
    #[inline]
    pub(crate) fn group_tables<T: crate::FastLanes>() -> (&'static [u8; 128], &'static [u8; 128]) {
        match T::T {
            8 => (&GATHER_8, &SCATTER_8),
            16 => (&GATHER_16, &SCATTER_16),
            32 => (&GATHER_32, &SCATTER_32),
            _ => (&GATHER_64, &SCATTER_64),
        }
    }
}

/// Reinterpret a 1024-bit block (`[u64; 16]`) as its 128 little-endian bytes.
#[inline]
#[must_use]
pub(crate) fn as_byte_array(block: &[u64; 16]) -> &[u8; 128] {
    // SAFETY: `[u64; 16]` and `[u8; 128]` have identical size (128 bytes). Every bit
    // pattern is a valid `u8`, and the source is over-aligned for a `u8` array.
    unsafe { &*block.as_ptr().cast::<[u8; 128]>() }
}

/// Reinterpret a mutable 1024-bit block (`[u64; 16]`) as its 128 little-endian bytes.
#[inline]
#[must_use]
pub(crate) fn as_byte_array_mut(block: &mut [u64; 16]) -> &mut [u8; 128] {
    // SAFETY: `[u64; 16]` and `[u8; 128]` have identical size (128 bytes). Every bit
    // pattern is a valid `u8`, and the source is over-aligned for a `u8` array.
    unsafe { &mut *block.as_mut_ptr().cast::<[u8; 128]>() }
}

/// Whether the AVX-512 VBMI implementation should be used.
///
/// Resolved at runtime with the `std` feature, otherwise at compile time.
#[cfg(target_arch = "x86_64")]
#[inline]
fn detect_vbmi() -> bool {
    #[cfg(feature = "std")]
    {
        std::is_x86_feature_detected!("avx512vbmi")
            && std::is_x86_feature_detected!("avx512bw")
            && std::is_x86_feature_detected!("avx512f")
    }
    #[cfg(not(feature = "std"))]
    {
        cfg!(all(
            target_feature = "avx512vbmi",
            target_feature = "avx512bw",
            target_feature = "avx512f"
        ))
    }
}

/// Whether the BMI2 implementation should be used.
///
/// Resolved at runtime with the `std` feature, otherwise at compile time.
#[cfg(target_arch = "x86_64")]
#[inline]
fn detect_bmi2() -> bool {
    #[cfg(feature = "std")]
    {
        std::is_x86_feature_detected!("bmi2")
    }
    #[cfg(not(feature = "std"))]
    {
        cfg!(target_feature = "bmi2")
    }
}

/// Transpose 1024 bits into `FastLanes` layout, dispatching to the best implementation.
#[inline]
pub fn transpose_bits(input: &[u64; 16], output: &mut [u64; 16]) {
    #[cfg(target_arch = "x86_64")]
    {
        if detect_vbmi() {
            // SAFETY: guarded by `detect_vbmi`.
            unsafe { x86::transpose_bits_vbmi(input, output) }
        } else if detect_bmi2() {
            // SAFETY: guarded by `detect_bmi2`.
            unsafe { x86::transpose_bits_bmi2(input, output) }
        } else {
            scalar::transpose_bits(input, output);
        }
    }
    #[cfg(target_arch = "aarch64")]
    // SAFETY: NEON is always available on aarch64.
    unsafe {
        aarch64::transpose_bits_neon(input, output);
    }
    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    scalar::transpose_bits(input, output);
}

/// Untranspose a `T`-width comparison mask (1024 bits) from `FastLanes` layout into logical row
/// order, dispatching to the best implementation. For `T = u64` this is the canonical `FastLanes`
/// bit untranspose; narrower `T` undo the per-lane packing produced by `unpack_cmp` for that width.
#[inline]
pub fn untranspose_bits<T: crate::FastLanes>(input: &[u64; 16], output: &mut [u64; 16]) {
    #[cfg(target_arch = "x86_64")]
    {
        if detect_vbmi() {
            // SAFETY: guarded by `detect_vbmi`.
            unsafe { x86::untranspose_bits_vbmi::<T>(input, output) }
        } else if detect_bmi2() {
            // SAFETY: guarded by `detect_bmi2`.
            unsafe { x86::untranspose_bits_bmi2::<T>(input, output) }
        } else {
            scalar::untranspose_bits::<T>(input, output);
        }
    }
    #[cfg(target_arch = "aarch64")]
    // SAFETY: NEON is always available on aarch64.
    unsafe {
        aarch64::untranspose_bits_neon::<T>(input, output);
    }
    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    scalar::untranspose_bits::<T>(input, output);
}

#[cfg(test)]
pub(crate) fn generate_test_data(seed: u8) -> [u64; 16] {
    let mut data = [0u64; 16];
    for (i, byte) in as_byte_array_mut(&mut data).iter_mut().enumerate() {
        *byte = seed.wrapping_mul(17).wrapping_add(i as u8).wrapping_mul(31);
    }
    data
}

/// Reference transpose built directly on top of [`crate::transpose`], one bit at a time.
#[cfg(test)]
pub(crate) fn transpose_bits_baseline(input: &[u64; 16], output: &mut [u64; 16]) {
    let input = as_byte_array(input);
    let output = as_byte_array_mut(output);
    *output = [0u8; 128];
    for in_bit in 0..1024 {
        let out_bit = crate::transpose(in_bit);
        let bit_val = (input[in_bit / 8] >> (in_bit % 8)) & 1;
        output[out_bit / 8] |= bit_val << (out_bit % 8);
    }
}

/// Reference untranspose for a `T`-width comparison mask, one bit at a time.
///
/// Mask bit `b = lane * T::T + row` holds the comparison for logical index `index(row, lane)`
/// (the `unpack!` macro's formula), so we scatter each mask bit to its logical position. For
/// `T = u64` this is exactly the inverse permutation of [`transpose_bits_baseline`].
#[cfg(test)]
pub(crate) fn untranspose_bits_baseline<T: crate::FastLanes>(
    input: &[u64; 16],
    output: &mut [u64; 16],
) {
    let input = as_byte_array(input);
    let output = as_byte_array_mut(output);
    *output = [0u8; 128];
    for b in 0..1024 {
        let lane = b / T::T;
        let row = b % T::T;
        let logical = crate::FL_ORDER[row / 8] * 16 + (row % 8) * 128 + lane;
        let bit_val = (input[b / 8] >> (b % 8)) & 1;
        output[logical / 8] |= bit_val << (logical % 8);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_baseline_roundtrip() {
        let input = generate_test_data(42);
        let mut transposed = [0u64; 16];
        let mut roundtrip = [0u64; 16];

        transpose_bits_baseline(&input, &mut transposed);
        untranspose_bits_baseline::<u64>(&transposed, &mut roundtrip);

        assert_eq!(input, roundtrip);
    }

    #[test]
    fn test_dispatch_matches_baseline() {
        for seed in [0, 42, 123, 255] {
            let input = generate_test_data(seed);
            let mut baseline_out = [0u64; 16];
            let mut out = [0u64; 16];

            transpose_bits_baseline(&input, &mut baseline_out);
            transpose_bits(&input, &mut out);

            assert_eq!(
                baseline_out, out,
                "transpose dispatch doesn't match baseline for seed {seed}"
            );
        }
    }

    #[test]
    fn test_untranspose_dispatch_matches_baseline() {
        fn check<T: crate::FastLanes>() {
            for seed in [0, 42, 123, 255] {
                let input = generate_test_data(seed);
                let mut baseline_out = [0u64; 16];
                let mut out = [0u64; 16];

                untranspose_bits_baseline::<T>(&input, &mut baseline_out);
                untranspose_bits::<T>(&input, &mut out);

                assert_eq!(
                    baseline_out,
                    out,
                    "untranspose dispatch doesn't match baseline for type={} seed={seed}",
                    core::any::type_name::<T>()
                );
            }
        }
        check::<u8>();
        check::<u16>();
        check::<u32>();
        check::<u64>();
    }

    #[test]
    fn test_dispatch_roundtrip() {
        for seed in [0, 42, 123, 255] {
            let input = generate_test_data(seed);
            let mut transposed = [0u64; 16];
            let mut roundtrip = [0u64; 16];

            transpose_bits(&input, &mut transposed);
            untranspose_bits::<u64>(&transposed, &mut roundtrip);

            assert_eq!(
                input, roundtrip,
                "dispatch roundtrip failed for seed {seed}"
            );
        }
    }

    /// The shared `group_perm` gather/scatter tables are consumed only by the SIMD kernels
    /// (NEON on `aarch64`, VBMI on `x86_64`) — so on a given host most of them are never executed
    /// (e.g. on an x86 CI runner without AVX-512VBMI nothing touches them, since BMI2 computes
    /// its indices inline). These two tests exercise the tables directly, on every architecture,
    /// independent of any SIMD support.
    #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
    mod group_perm_tables {
        use super::*;

        /// Portable scalar re-implementation of the SIMD width-generic untranspose, driven purely
        /// by the shared tables: gather the 16 byte-groups into group-major order, 8x8-transpose
        /// each group, then scatter back. Equivalent to the NEON/VBMI kernels but in plain Rust,
        /// so it validates the table *data* against the bit-level baseline on any host.
        fn untranspose_via_tables<T: crate::FastLanes>(input: &[u64; 16]) -> [u64; 16] {
            fn transpose_8x8(mut x: u64) -> u64 {
                let t = (x ^ (x >> 7)) & TRANSPOSE_2X2;
                x = x ^ t ^ (t << 7);
                let t = (x ^ (x >> 14)) & TRANSPOSE_4X4;
                x = x ^ t ^ (t << 14);
                let t = (x ^ (x >> 28)) & TRANSPOSE_8X8;
                x ^ t ^ (t << 28)
            }

            let (gather, scatter) = group_perm::group_tables::<T>();
            let src = as_byte_array(input);

            // Gather: group-major order, `grouped[k] = src[gather[k]]`.
            let mut grouped = [0u8; 128];
            for k in 0..128 {
                grouped[k] = src[gather[k] as usize];
            }

            // 8x8 bit-transpose each of the 16 groups (8 bytes each).
            let mut transposed = [0u8; 128];
            for g in 0..16 {
                let mut word = 0u64;
                for b in 0..8 {
                    word |= u64::from(grouped[g * 8 + b]) << (b * 8);
                }
                let w = transpose_8x8(word);
                for b in 0..8 {
                    transposed[g * 8 + b] = (w >> (b * 8)) as u8;
                }
            }

            // Scatter: `out[k] = transposed[scatter[k]]`.
            let mut out = [0u64; 16];
            let dst = as_byte_array_mut(&mut out);
            for k in 0..128 {
                dst[k] = transposed[scatter[k] as usize];
            }
            out
        }

        #[test]
        fn tables_match_baseline_all_widths() {
            fn check<T: crate::FastLanes>() {
                for seed in [0, 1, 42, 123, 200, 255] {
                    let input = generate_test_data(seed);
                    let mut baseline = [0u64; 16];
                    untranspose_bits_baseline::<T>(&input, &mut baseline);
                    assert_eq!(
                        untranspose_via_tables::<T>(&input),
                        baseline,
                        "group_perm tables != baseline for type={} seed={seed}",
                        core::any::type_name::<T>()
                    );
                }
            }
            check::<u8>();
            check::<u16>();
            check::<u32>();
            check::<u64>();
        }

        /// Both tables must be permutations of `0..128` — every input byte is read exactly once
        /// (gather) and every output byte is written exactly once (scatter). A duplicated or
        /// dropped index would silently corrupt data, so assert bijectivity for every width.
        #[test]
        fn tables_are_permutations() {
            fn is_permutation(t: &[u8; 128]) -> bool {
                let mut seen = [false; 128];
                for &i in t {
                    if i as usize >= 128 || seen[i as usize] {
                        return false;
                    }
                    seen[i as usize] = true;
                }
                seen.iter().all(|&b| b)
            }
            fn check<T: crate::FastLanes>() {
                let (gather, scatter) = group_perm::group_tables::<T>();
                assert!(
                    is_permutation(gather),
                    "gather table for {} is not a permutation",
                    core::any::type_name::<T>()
                );
                assert!(
                    is_permutation(scatter),
                    "scatter table for {} is not a permutation",
                    core::any::type_name::<T>()
                );
            }
            check::<u8>();
            check::<u16>();
            check::<u32>();
            check::<u64>();
        }
    }
}