lance_bitpacking/
lib.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4// NOTICE:
5// This file is a modification of the `fastlanes` crate: https://github.com/spiraldb/fastlanes
6// It is modified to allow a rust stable build
7//
8// The original code can be accessed at
9//      https://github.com/spiraldb/fastlanes/blob/8e0ff374f815d919d0c0ebdccf5ffd9e6dc7d663/src/bitpacking.rs
10//      https://github.com/spiraldb/fastlanes/blob/8e0ff374f815d919d0c0ebdccf5ffd9e6dc7d663/src/lib.rs
11//      https://github.com/spiraldb/fastlanes/blob/8e0ff374f815d919d0c0ebdccf5ffd9e6dc7d663/src/macros.rs
12//
13// The original code is licensed under the Apache Software License:
14// https://github.com/spiraldb/fastlanes/blob/8e0ff374f815d919d0c0ebdccf5ffd9e6dc7d663/LICENSE
15
16use arrayref::{array_mut_ref, array_ref};
17use core::mem::size_of;
18use paste::paste;
19
/// Permutation of `0..8` consulted by the `index` helper inside the `pack!`/`unpack!` macros:
/// the row group `row / 8` is placed at offset `FL_ORDER[row / 8] * 16` in the 1024-element chunk.
// NOTE(review): presumably this is the FastLanes transposed ordering from the upstream crate
// referenced in the header — confirm against upstream before changing it.
pub const FL_ORDER: [usize; 8] = [0, 4, 2, 6, 1, 5, 3, 7];
21
/// Integer types that can be used as the lane type of a 1024-bit wide word.
pub trait FastLanes: Sized + Copy {
    /// Width of `Self` in bits.
    const T: usize = 8 * size_of::<Self>();
    /// How many values of `Self` fit side by side into 1024 bits.
    const LANES: usize = 1024 / <Self as FastLanes>::T;
}

// Every supported unsigned integer width relies on the default constants above.
impl FastLanes for u8 {}
impl FastLanes for u16 {}
impl FastLanes for u32 {}
impl FastLanes for u64 {}
32
// Packs one lane of a 1024-element chunk into `W` bits per value.
//
// Arguments:
//   * `$T`      - the integer type being packed; `<$T>::T` and `<$T>::LANES` are read from it.
//   * `$W`      - the packed bit width (0 ..= T).
//   * `$packed` - the indexable output; word `w` of lane `l` is written to `LANES * w + l`.
//   * `$lane`   - the lane to process.
//   * `|$idx| body` - a closure-like kernel; `body` is evaluated with `$idx` bound to the
//     position of the source element and must yield the value to pack.
//
// `$_1` captures the caller's literal `$` token so that the nested `__kernel__` macro can
// declare `$idx` as a metavariable of its own.
macro_rules! pack {
    ($T:ty, $W:expr, $packed:expr, $lane:expr, | $_1:tt $idx:ident | $($body:tt)*) => {
        // Turn the caller's `|$idx| body` into a callable macro.
        macro_rules! __kernel__ {( $_1 $idx:ident ) => ( $($body)* )}
        {
            use paste::paste;

            // The number of bits of T.
            const T: usize = <$T>::T;

            // Position of the element at (`row`, `lane`) within the unpacked chunk: rows are
            // handled in groups of 8, and `FL_ORDER` permutes those groups.
            #[inline(always)]
            fn index(row: usize, lane: usize) -> usize {
                let o = row / 8;
                let s = row % 8;
                (FL_ORDER[o] * 16) + (s * 128) + lane
            }

            if $W == 0 {
                // Nothing to do if W is 0, since the packed array is zero bytes.
            } else if $W == T {
                // Special case for W=T, we can just copy the input value directly to the packed value.
                paste!(seq_t!(row in $T {
                    let idx = index(row, $lane);
                    $packed[<$T>::LANES * row + $lane] = __kernel__!(idx);
                }));
            } else {
                // A mask of W bits.
                let mask: $T = (1 << $W) - 1;

                // Accumulator for the packed word that is currently being assembled for this lane.
                let mut tmp: $T = 0;

                // Loop over each of the rows of the lane.
                // Inlining this loop means all branches are known at compile time and
                // the code is auto-vectorized for SIMD execution.
                paste!(seq_t!(row in $T {
                    let idx = index(row, $lane);
                    let src = __kernel__!(idx);
                    // Drop any bits above the packed width.
                    let src = src & mask;

                    // Shift the src bits into their position in the tmp output variable.
                    if row == 0 {
                        tmp = src;
                    } else {
                        tmp |= src << (row * $W) % T;
                    }

                    // If the next packed position is after our current one, then we have filled
                    // the current output and we can write the packed value.
                    let curr_word: usize = (row * $W) / T;
                    let next_word: usize = ((row + 1) * $W) / T;

                    // The carry stored into `tmp` after the final row is never read again,
                    // hence the lint suppression.
                    #[allow(unused_assignments)]
                    if next_word > curr_word {
                        $packed[<$T>::LANES * curr_word + $lane] = tmp;
                        let remaining_bits: usize = ((row + 1) * $W) % T;
                        // Keep the remaining bits for the next packed value. When the value ended
                        // exactly on the word boundary this shifts out all W bits and leaves 0.
                        tmp = src >> $W - remaining_bits;
                    }
                }));
            }
        }
    };
}
96
// Unpacks one lane of a 1024-element chunk that was packed with `W` bits per value.
//
// Arguments:
//   * `$T`      - the integer type being unpacked; `<$T>::T` and `<$T>::LANES` are read from it.
//   * `$W`      - the packed bit width (0 ..= T).
//   * `$packed` - the indexable input; word `w` of lane `l` is read from `LANES * w + l`.
//   * `$lane`   - the lane to process.
//   * `|$idx, $elem| body` - a closure-like kernel; `body` is evaluated once per row with
//     `$idx` bound to the output position and `$elem` bound to the unpacked value.
//
// `$_1` and `$_2` capture the caller's literal `$` tokens so that the nested `__kernel__`
// macro can declare `$idx` and `$elem` as metavariables of its own.
macro_rules! unpack {
    ($T:ty, $W:expr, $packed:expr, $lane:expr, | $_1:tt $idx:ident, $_2:tt $elem:ident | $($body:tt)*) => {
        // Turn the caller's `|$idx, $elem| body` into a callable macro.
        macro_rules! __kernel__ {( $_1 $idx:ident, $_2 $elem:ident ) => ( $($body)* )}
        {
            use paste::paste;

            // The number of bits of T.
            const T: usize = <$T>::T;

            // Position of the element at (`row`, `lane`) within the unpacked chunk: rows are
            // handled in groups of 8, and `FL_ORDER` permutes those groups.
            #[inline(always)]
            fn index(row: usize, lane: usize) -> usize {
                let o = row / 8;
                let s = row % 8;
                (FL_ORDER[o] * 16) + (s * 128) + lane
            }

            if $W == 0 {
                // Special case for W=0, we just need to zero the output.
                // We'll still respect the iteration order in case the kernel has side effects.
                paste!(seq_t!(row in $T {
                    let idx = index(row, $lane);
                    let zero: $T = 0;
                    __kernel__!(idx, zero);
                }));
            } else if $W == T {
                // Special case for W=T, we can just copy the packed value directly to the output.
                paste!(seq_t!(row in $T {
                    let idx = index(row, $lane);
                    let src = $packed[<$T>::LANES * row + $lane];
                    __kernel__!(idx, src);
                }));
            } else {
                // Returns a value with the low `width` bits set. `width == T` is handled
                // separately because shifting by the full bit width would overflow.
                #[inline]
                fn mask(width: usize) -> $T {
                    if width == T { <$T>::MAX } else { (1 << (width % T)) - 1 }
                }

                // The packed word currently being consumed, starting with word 0 of this lane.
                let mut src: $T = $packed[$lane];
                // The value being reassembled for the current row.
                let mut tmp: $T;

                paste!(seq_t!(row in $T {
                    // Figure out the packed positions
                    let curr_word: usize = (row * $W) / T;
                    let next_word = ((row + 1) * $W) / T;

                    // Bit offset of this row's value inside the current packed word.
                    let shift = (row * $W) % T;

                    if next_word > curr_word {
                        // Consume some bits from the curr packed input, the remainder are in the next
                        // packed input value
                        let remaining_bits = ((row + 1) * $W) % T;
                        let current_bits = $W - remaining_bits;
                        tmp = (src >> shift) & mask(current_bits);

                        // A lane holds T rows of W bits, i.e. exactly W packed words, so
                        // `next_word == W` only happens after the final row and there is
                        // nothing left to load.
                        if next_word < $W {
                            // Load the next packed value
                            src = $packed[<$T>::LANES * next_word + $lane];
                            // Consume the remaining bits from the next input value. If the value
                            // ended exactly on the word boundary, `mask(0)` is 0 and adds nothing.
                            tmp |= (src & mask(remaining_bits)) << current_bits;
                        }
                    } else {
                        // Otherwise, just grab W bits from the src value
                        tmp = (src >> shift) & mask($W);
                    }

                    // Write out the unpacked value
                    let idx = index(row, $lane);
                    __kernel__!(idx, tmp);
                }));
            }
        }
    };
}
170
// Repeats `$body` once for every value of `$ident` in `0..N`, where `N` is the bit width of
// the unsigned integer type named in the invocation (8, 16, 32 or 64). The repetition itself
// is delegated to `seq_macro::seq!`; only the four types listed below are accepted.
macro_rules! seq_t {
    ($ident:ident in u8 $body:tt) => {seq_macro::seq!($ident in 0..8 $body)};
    ($ident:ident in u16 $body:tt) => {seq_macro::seq!($ident in 0..16 $body)};
    ($ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)};
    ($ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)};
}
178
/// Bit-packing and unpacking of 1024-element chunks, with the bit width chosen at runtime.
///
/// In the descriptions below `T` is the bit width of `Self` (see [`FastLanes::T`]) and `W` is
/// the value passed as `width`.
pub trait BitPacking: FastLanes {
    /// Packs the 1024 elements of `input` into `width` bits each and writes the result to
    /// `output`.
    ///
    /// # Safety
    /// The input slice must be of exactly length 1024. The output slice must be of length
    /// `1024 * W / T`, where `T` is the bit-width of Self and `W` is the packed width.
    /// `width` must not exceed `T`.
    /// These requirements are checked only with `debug_assert` (i.e., not checked on release
    /// builds).
    unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]);

    /// Unpacks 1024 elements, stored with `width` bits each in `input`, into `output`.
    ///
    /// # Safety
    /// The input slice must be of length `1024 * W / T`, where `T` is the bit-width of Self and `W`
    /// is the packed width. The output slice must be of exactly length 1024.
    /// `width` must not exceed `T`.
    /// These requirements are checked only with `debug_assert` (i.e., not checked on release
    /// builds).
    unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]);
}
199
200impl BitPacking for u8 {
201    unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]) {
202        let packed_len = 128 * width / size_of::<Self>();
203        debug_assert_eq!(
204            output.len(),
205            packed_len,
206            "Output buffer must be of size 1024 * W / T"
207        );
208        debug_assert_eq!(input.len(), 1024, "Input buffer must be of size 1024");
209        debug_assert!(
210            width <= Self::T,
211            "Width must be less than or equal to {}",
212            Self::T
213        );
214
215        match width {
216            0 => {
217                // Nothing to write when width is zero.
218            }
219            1 => pack_8_1(
220                array_ref![input, 0, 1024],
221                array_mut_ref![output, 0, 1024 / 8],
222            ),
223            2 => pack_8_2(
224                array_ref![input, 0, 1024],
225                array_mut_ref![output, 0, 1024 * 2 / 8],
226            ),
227            3 => pack_8_3(
228                array_ref![input, 0, 1024],
229                array_mut_ref![output, 0, 1024 * 3 / 8],
230            ),
231            4 => pack_8_4(
232                array_ref![input, 0, 1024],
233                array_mut_ref![output, 0, 1024 * 4 / 8],
234            ),
235            5 => pack_8_5(
236                array_ref![input, 0, 1024],
237                array_mut_ref![output, 0, 1024 * 5 / 8],
238            ),
239            6 => pack_8_6(
240                array_ref![input, 0, 1024],
241                array_mut_ref![output, 0, 1024 * 6 / 8],
242            ),
243            7 => pack_8_7(
244                array_ref![input, 0, 1024],
245                array_mut_ref![output, 0, 1024 * 7 / 8],
246            ),
247            8 => pack_8_8(
248                array_ref![input, 0, 1024],
249                array_mut_ref![output, 0, 1024 * 8 / 8],
250            ),
251
252            _ => unreachable!("Unsupported width: {}", width),
253        }
254    }
255
256    unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]) {
257        let packed_len = 128 * width / size_of::<Self>();
258        debug_assert_eq!(
259            input.len(),
260            packed_len,
261            "Input buffer must be of size 1024 * W / T"
262        );
263        debug_assert_eq!(output.len(), 1024, "Output buffer must be of size 1024");
264        debug_assert!(
265            width <= Self::T,
266            "Width must be less than or equal to {}",
267            Self::T
268        );
269
270        match width {
271            0 => {
272                // A zero-width packed chunk implies all zeros.
273                output.fill(0);
274            }
275            1 => unpack_8_1(
276                array_ref![input, 0, 1024 / 8],
277                array_mut_ref![output, 0, 1024],
278            ),
279            2 => unpack_8_2(
280                array_ref![input, 0, 1024 * 2 / 8],
281                array_mut_ref![output, 0, 1024],
282            ),
283            3 => unpack_8_3(
284                array_ref![input, 0, 1024 * 3 / 8],
285                array_mut_ref![output, 0, 1024],
286            ),
287            4 => unpack_8_4(
288                array_ref![input, 0, 1024 * 4 / 8],
289                array_mut_ref![output, 0, 1024],
290            ),
291            5 => unpack_8_5(
292                array_ref![input, 0, 1024 * 5 / 8],
293                array_mut_ref![output, 0, 1024],
294            ),
295            6 => unpack_8_6(
296                array_ref![input, 0, 1024 * 6 / 8],
297                array_mut_ref![output, 0, 1024],
298            ),
299            7 => unpack_8_7(
300                array_ref![input, 0, 1024 * 7 / 8],
301                array_mut_ref![output, 0, 1024],
302            ),
303            8 => unpack_8_8(
304                array_ref![input, 0, 1024 * 8 / 8],
305                array_mut_ref![output, 0, 1024],
306            ),
307
308            _ => unreachable!("Unsupported width: {}", width),
309        }
310    }
311}
312
313impl BitPacking for u16 {
314    unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]) {
315        let packed_len = 128 * width / size_of::<Self>();
316        debug_assert_eq!(
317            output.len(),
318            packed_len,
319            "Output buffer must be of size 1024 * W / T"
320        );
321        debug_assert_eq!(input.len(), 1024, "Input buffer must be of size 1024");
322        debug_assert!(
323            width <= Self::T,
324            "Width must be less than or equal to {}",
325            Self::T
326        );
327
328        match width {
329            0 => {
330                // Nothing to write when width is zero.
331            }
332            1 => pack_16_1(
333                array_ref![input, 0, 1024],
334                array_mut_ref![output, 0, 1024 / 16],
335            ),
336            2 => pack_16_2(
337                array_ref![input, 0, 1024],
338                array_mut_ref![output, 0, 1024 * 2 / 16],
339            ),
340            3 => pack_16_3(
341                array_ref![input, 0, 1024],
342                array_mut_ref![output, 0, 1024 * 3 / 16],
343            ),
344            4 => pack_16_4(
345                array_ref![input, 0, 1024],
346                array_mut_ref![output, 0, 1024 * 4 / 16],
347            ),
348            5 => pack_16_5(
349                array_ref![input, 0, 1024],
350                array_mut_ref![output, 0, 1024 * 5 / 16],
351            ),
352            6 => pack_16_6(
353                array_ref![input, 0, 1024],
354                array_mut_ref![output, 0, 1024 * 6 / 16],
355            ),
356            7 => pack_16_7(
357                array_ref![input, 0, 1024],
358                array_mut_ref![output, 0, 1024 * 7 / 16],
359            ),
360            8 => pack_16_8(
361                array_ref![input, 0, 1024],
362                array_mut_ref![output, 0, 1024 * 8 / 16],
363            ),
364            9 => pack_16_9(
365                array_ref![input, 0, 1024],
366                array_mut_ref![output, 0, 1024 * 9 / 16],
367            ),
368
369            10 => pack_16_10(
370                array_ref![input, 0, 1024],
371                array_mut_ref![output, 0, 1024 * 10 / 16],
372            ),
373            11 => pack_16_11(
374                array_ref![input, 0, 1024],
375                array_mut_ref![output, 0, 1024 * 11 / 16],
376            ),
377            12 => pack_16_12(
378                array_ref![input, 0, 1024],
379                array_mut_ref![output, 0, 1024 * 12 / 16],
380            ),
381            13 => pack_16_13(
382                array_ref![input, 0, 1024],
383                array_mut_ref![output, 0, 1024 * 13 / 16],
384            ),
385            14 => pack_16_14(
386                array_ref![input, 0, 1024],
387                array_mut_ref![output, 0, 1024 * 14 / 16],
388            ),
389            15 => pack_16_15(
390                array_ref![input, 0, 1024],
391                array_mut_ref![output, 0, 1024 * 15 / 16],
392            ),
393            16 => pack_16_16(
394                array_ref![input, 0, 1024],
395                array_mut_ref![output, 0, 1024 * 16 / 16],
396            ),
397
398            _ => unreachable!("Unsupported width: {}", width),
399        }
400    }
401
402    unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]) {
403        let packed_len = 128 * width / size_of::<Self>();
404        debug_assert_eq!(
405            input.len(),
406            packed_len,
407            "Input buffer must be of size 1024 * W / T"
408        );
409        debug_assert_eq!(output.len(), 1024, "Output buffer must be of size 1024");
410        debug_assert!(
411            width <= Self::T,
412            "Width must be less than or equal to {}",
413            Self::T
414        );
415
416        match width {
417            0 => {
418                output.fill(0);
419            }
420            1 => unpack_16_1(
421                array_ref![input, 0, 1024 / 16],
422                array_mut_ref![output, 0, 1024],
423            ),
424            2 => unpack_16_2(
425                array_ref![input, 0, 1024 * 2 / 16],
426                array_mut_ref![output, 0, 1024],
427            ),
428            3 => unpack_16_3(
429                array_ref![input, 0, 1024 * 3 / 16],
430                array_mut_ref![output, 0, 1024],
431            ),
432            4 => unpack_16_4(
433                array_ref![input, 0, 1024 * 4 / 16],
434                array_mut_ref![output, 0, 1024],
435            ),
436            5 => unpack_16_5(
437                array_ref![input, 0, 1024 * 5 / 16],
438                array_mut_ref![output, 0, 1024],
439            ),
440            6 => unpack_16_6(
441                array_ref![input, 0, 1024 * 6 / 16],
442                array_mut_ref![output, 0, 1024],
443            ),
444            7 => unpack_16_7(
445                array_ref![input, 0, 1024 * 7 / 16],
446                array_mut_ref![output, 0, 1024],
447            ),
448            8 => unpack_16_8(
449                array_ref![input, 0, 1024 * 8 / 16],
450                array_mut_ref![output, 0, 1024],
451            ),
452            9 => unpack_16_9(
453                array_ref![input, 0, 1024 * 9 / 16],
454                array_mut_ref![output, 0, 1024],
455            ),
456
457            10 => unpack_16_10(
458                array_ref![input, 0, 1024 * 10 / 16],
459                array_mut_ref![output, 0, 1024],
460            ),
461            11 => unpack_16_11(
462                array_ref![input, 0, 1024 * 11 / 16],
463                array_mut_ref![output, 0, 1024],
464            ),
465            12 => unpack_16_12(
466                array_ref![input, 0, 1024 * 12 / 16],
467                array_mut_ref![output, 0, 1024],
468            ),
469            13 => unpack_16_13(
470                array_ref![input, 0, 1024 * 13 / 16],
471                array_mut_ref![output, 0, 1024],
472            ),
473            14 => unpack_16_14(
474                array_ref![input, 0, 1024 * 14 / 16],
475                array_mut_ref![output, 0, 1024],
476            ),
477            15 => unpack_16_15(
478                array_ref![input, 0, 1024 * 15 / 16],
479                array_mut_ref![output, 0, 1024],
480            ),
481            16 => unpack_16_16(
482                array_ref![input, 0, 1024 * 16 / 16],
483                array_mut_ref![output, 0, 1024],
484            ),
485
486            _ => unreachable!("Unsupported width: {}", width),
487        }
488    }
489}
490
491impl BitPacking for u32 {
492    unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]) {
493        let packed_len = 128 * width / size_of::<Self>();
494        debug_assert_eq!(
495            output.len(),
496            packed_len,
497            "Output buffer must be of size 1024 * W / T"
498        );
499        debug_assert_eq!(input.len(), 1024, "Input buffer must be of size 1024");
500        debug_assert!(
501            width <= Self::T,
502            "Width must be less than or equal to {}",
503            Self::T
504        );
505
506        match width {
507            0 => {
508                // Nothing to write when width is zero.
509            }
510            1 => pack_32_1(
511                array_ref![input, 0, 1024],
512                array_mut_ref![output, 0, 1024 / 32],
513            ),
514            2 => pack_32_2(
515                array_ref![input, 0, 1024],
516                array_mut_ref![output, 0, 1024 * 2 / 32],
517            ),
518            3 => pack_32_3(
519                array_ref![input, 0, 1024],
520                array_mut_ref![output, 0, 1024 * 3 / 32],
521            ),
522            4 => pack_32_4(
523                array_ref![input, 0, 1024],
524                array_mut_ref![output, 0, 1024 * 4 / 32],
525            ),
526            5 => pack_32_5(
527                array_ref![input, 0, 1024],
528                array_mut_ref![output, 0, 1024 * 5 / 32],
529            ),
530            6 => pack_32_6(
531                array_ref![input, 0, 1024],
532                array_mut_ref![output, 0, 1024 * 6 / 32],
533            ),
534            7 => pack_32_7(
535                array_ref![input, 0, 1024],
536                array_mut_ref![output, 0, 1024 * 7 / 32],
537            ),
538            8 => pack_32_8(
539                array_ref![input, 0, 1024],
540                array_mut_ref![output, 0, 1024 * 8 / 32],
541            ),
542            9 => pack_32_9(
543                array_ref![input, 0, 1024],
544                array_mut_ref![output, 0, 1024 * 9 / 32],
545            ),
546
547            10 => pack_32_10(
548                array_ref![input, 0, 1024],
549                array_mut_ref![output, 0, 1024 * 10 / 32],
550            ),
551            11 => pack_32_11(
552                array_ref![input, 0, 1024],
553                array_mut_ref![output, 0, 1024 * 11 / 32],
554            ),
555            12 => pack_32_12(
556                array_ref![input, 0, 1024],
557                array_mut_ref![output, 0, 1024 * 12 / 32],
558            ),
559            13 => pack_32_13(
560                array_ref![input, 0, 1024],
561                array_mut_ref![output, 0, 1024 * 13 / 32],
562            ),
563            14 => pack_32_14(
564                array_ref![input, 0, 1024],
565                array_mut_ref![output, 0, 1024 * 14 / 32],
566            ),
567            15 => pack_32_15(
568                array_ref![input, 0, 1024],
569                array_mut_ref![output, 0, 1024 * 15 / 32],
570            ),
571            16 => pack_32_16(
572                array_ref![input, 0, 1024],
573                array_mut_ref![output, 0, 1024 * 16 / 32],
574            ),
575            17 => pack_32_17(
576                array_ref![input, 0, 1024],
577                array_mut_ref![output, 0, 1024 * 17 / 32],
578            ),
579            18 => pack_32_18(
580                array_ref![input, 0, 1024],
581                array_mut_ref![output, 0, 1024 * 18 / 32],
582            ),
583            19 => pack_32_19(
584                array_ref![input, 0, 1024],
585                array_mut_ref![output, 0, 1024 * 19 / 32],
586            ),
587
588            20 => pack_32_20(
589                array_ref![input, 0, 1024],
590                array_mut_ref![output, 0, 1024 * 20 / 32],
591            ),
592            21 => pack_32_21(
593                array_ref![input, 0, 1024],
594                array_mut_ref![output, 0, 1024 * 21 / 32],
595            ),
596            22 => pack_32_22(
597                array_ref![input, 0, 1024],
598                array_mut_ref![output, 0, 1024 * 22 / 32],
599            ),
600            23 => pack_32_23(
601                array_ref![input, 0, 1024],
602                array_mut_ref![output, 0, 1024 * 23 / 32],
603            ),
604            24 => pack_32_24(
605                array_ref![input, 0, 1024],
606                array_mut_ref![output, 0, 1024 * 24 / 32],
607            ),
608            25 => pack_32_25(
609                array_ref![input, 0, 1024],
610                array_mut_ref![output, 0, 1024 * 25 / 32],
611            ),
612            26 => pack_32_26(
613                array_ref![input, 0, 1024],
614                array_mut_ref![output, 0, 1024 * 26 / 32],
615            ),
616            27 => pack_32_27(
617                array_ref![input, 0, 1024],
618                array_mut_ref![output, 0, 1024 * 27 / 32],
619            ),
620            28 => pack_32_28(
621                array_ref![input, 0, 1024],
622                array_mut_ref![output, 0, 1024 * 28 / 32],
623            ),
624            29 => pack_32_29(
625                array_ref![input, 0, 1024],
626                array_mut_ref![output, 0, 1024 * 29 / 32],
627            ),
628
629            30 => pack_32_30(
630                array_ref![input, 0, 1024],
631                array_mut_ref![output, 0, 1024 * 30 / 32],
632            ),
633            31 => pack_32_31(
634                array_ref![input, 0, 1024],
635                array_mut_ref![output, 0, 1024 * 31 / 32],
636            ),
637            32 => pack_32_32(
638                array_ref![input, 0, 1024],
639                array_mut_ref![output, 0, 1024 * 32 / 32],
640            ),
641
642            _ => unreachable!("Unsupported width: {}", width),
643        }
644    }
645
646    unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]) {
647        let packed_len = 128 * width / size_of::<Self>();
648        debug_assert_eq!(
649            input.len(),
650            packed_len,
651            "Input buffer must be of size 1024 * W / T"
652        );
653        debug_assert_eq!(output.len(), 1024, "Output buffer must be of size 1024");
654        debug_assert!(
655            width <= Self::T,
656            "Width must be less than or equal to {}",
657            Self::T
658        );
659
660        match width {
661            0 => {
662                output.fill(0);
663            }
664            1 => unpack_32_1(
665                array_ref![input, 0, 1024 / 32],
666                array_mut_ref![output, 0, 1024],
667            ),
668            2 => unpack_32_2(
669                array_ref![input, 0, 1024 * 2 / 32],
670                array_mut_ref![output, 0, 1024],
671            ),
672            3 => unpack_32_3(
673                array_ref![input, 0, 1024 * 3 / 32],
674                array_mut_ref![output, 0, 1024],
675            ),
676            4 => unpack_32_4(
677                array_ref![input, 0, 1024 * 4 / 32],
678                array_mut_ref![output, 0, 1024],
679            ),
680            5 => unpack_32_5(
681                array_ref![input, 0, 1024 * 5 / 32],
682                array_mut_ref![output, 0, 1024],
683            ),
684            6 => unpack_32_6(
685                array_ref![input, 0, 1024 * 6 / 32],
686                array_mut_ref![output, 0, 1024],
687            ),
688            7 => unpack_32_7(
689                array_ref![input, 0, 1024 * 7 / 32],
690                array_mut_ref![output, 0, 1024],
691            ),
692            8 => unpack_32_8(
693                array_ref![input, 0, 1024 * 8 / 32],
694                array_mut_ref![output, 0, 1024],
695            ),
696            9 => unpack_32_9(
697                array_ref![input, 0, 1024 * 9 / 32],
698                array_mut_ref![output, 0, 1024],
699            ),
700
701            10 => unpack_32_10(
702                array_ref![input, 0, 1024 * 10 / 32],
703                array_mut_ref![output, 0, 1024],
704            ),
705            11 => unpack_32_11(
706                array_ref![input, 0, 1024 * 11 / 32],
707                array_mut_ref![output, 0, 1024],
708            ),
709            12 => unpack_32_12(
710                array_ref![input, 0, 1024 * 12 / 32],
711                array_mut_ref![output, 0, 1024],
712            ),
713            13 => unpack_32_13(
714                array_ref![input, 0, 1024 * 13 / 32],
715                array_mut_ref![output, 0, 1024],
716            ),
717            14 => unpack_32_14(
718                array_ref![input, 0, 1024 * 14 / 32],
719                array_mut_ref![output, 0, 1024],
720            ),
721            15 => unpack_32_15(
722                array_ref![input, 0, 1024 * 15 / 32],
723                array_mut_ref![output, 0, 1024],
724            ),
725            16 => unpack_32_16(
726                array_ref![input, 0, 1024 * 16 / 32],
727                array_mut_ref![output, 0, 1024],
728            ),
729            17 => unpack_32_17(
730                array_ref![input, 0, 1024 * 17 / 32],
731                array_mut_ref![output, 0, 1024],
732            ),
733            18 => unpack_32_18(
734                array_ref![input, 0, 1024 * 18 / 32],
735                array_mut_ref![output, 0, 1024],
736            ),
737            19 => unpack_32_19(
738                array_ref![input, 0, 1024 * 19 / 32],
739                array_mut_ref![output, 0, 1024],
740            ),
741
742            20 => unpack_32_20(
743                array_ref![input, 0, 1024 * 20 / 32],
744                array_mut_ref![output, 0, 1024],
745            ),
746            21 => unpack_32_21(
747                array_ref![input, 0, 1024 * 21 / 32],
748                array_mut_ref![output, 0, 1024],
749            ),
750            22 => unpack_32_22(
751                array_ref![input, 0, 1024 * 22 / 32],
752                array_mut_ref![output, 0, 1024],
753            ),
754            23 => unpack_32_23(
755                array_ref![input, 0, 1024 * 23 / 32],
756                array_mut_ref![output, 0, 1024],
757            ),
758            24 => unpack_32_24(
759                array_ref![input, 0, 1024 * 24 / 32],
760                array_mut_ref![output, 0, 1024],
761            ),
762            25 => unpack_32_25(
763                array_ref![input, 0, 1024 * 25 / 32],
764                array_mut_ref![output, 0, 1024],
765            ),
766            26 => unpack_32_26(
767                array_ref![input, 0, 1024 * 26 / 32],
768                array_mut_ref![output, 0, 1024],
769            ),
770            27 => unpack_32_27(
771                array_ref![input, 0, 1024 * 27 / 32],
772                array_mut_ref![output, 0, 1024],
773            ),
774            28 => unpack_32_28(
775                array_ref![input, 0, 1024 * 28 / 32],
776                array_mut_ref![output, 0, 1024],
777            ),
778            29 => unpack_32_29(
779                array_ref![input, 0, 1024 * 29 / 32],
780                array_mut_ref![output, 0, 1024],
781            ),
782
783            30 => unpack_32_30(
784                array_ref![input, 0, 1024 * 30 / 32],
785                array_mut_ref![output, 0, 1024],
786            ),
787            31 => unpack_32_31(
788                array_ref![input, 0, 1024 * 31 / 32],
789                array_mut_ref![output, 0, 1024],
790            ),
791            32 => unpack_32_32(
792                array_ref![input, 0, 1024 * 32 / 32],
793                array_mut_ref![output, 0, 1024],
794            ),
795
796            _ => unreachable!("Unsupported width: {}", width),
797        }
798    }
799}
800
801impl BitPacking for u64 {
802    unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]) {
803        let packed_len = 128 * width / size_of::<Self>();
804        debug_assert_eq!(
805            output.len(),
806            packed_len,
807            "Output buffer must be of size 1024 * W / T"
808        );
809        debug_assert_eq!(input.len(), 1024, "Input buffer must be of size 1024");
810        debug_assert!(
811            width <= Self::T,
812            "Width must be less than or equal to {}",
813            Self::T
814        );
815
816        match width {
817            0 => {
818                // Nothing to write when width is zero.
819            }
820            1 => pack_64_1(
821                array_ref![input, 0, 1024],
822                array_mut_ref![output, 0, 1024 / 64],
823            ),
824            2 => pack_64_2(
825                array_ref![input, 0, 1024],
826                array_mut_ref![output, 0, 1024 * 2 / 64],
827            ),
828            3 => pack_64_3(
829                array_ref![input, 0, 1024],
830                array_mut_ref![output, 0, 1024 * 3 / 64],
831            ),
832            4 => pack_64_4(
833                array_ref![input, 0, 1024],
834                array_mut_ref![output, 0, 1024 * 4 / 64],
835            ),
836            5 => pack_64_5(
837                array_ref![input, 0, 1024],
838                array_mut_ref![output, 0, 1024 * 5 / 64],
839            ),
840            6 => pack_64_6(
841                array_ref![input, 0, 1024],
842                array_mut_ref![output, 0, 1024 * 6 / 64],
843            ),
844            7 => pack_64_7(
845                array_ref![input, 0, 1024],
846                array_mut_ref![output, 0, 1024 * 7 / 64],
847            ),
848            8 => pack_64_8(
849                array_ref![input, 0, 1024],
850                array_mut_ref![output, 0, 1024 * 8 / 64],
851            ),
852            9 => pack_64_9(
853                array_ref![input, 0, 1024],
854                array_mut_ref![output, 0, 1024 * 9 / 64],
855            ),
856
857            10 => pack_64_10(
858                array_ref![input, 0, 1024],
859                array_mut_ref![output, 0, 1024 * 10 / 64],
860            ),
861            11 => pack_64_11(
862                array_ref![input, 0, 1024],
863                array_mut_ref![output, 0, 1024 * 11 / 64],
864            ),
865            12 => pack_64_12(
866                array_ref![input, 0, 1024],
867                array_mut_ref![output, 0, 1024 * 12 / 64],
868            ),
869            13 => pack_64_13(
870                array_ref![input, 0, 1024],
871                array_mut_ref![output, 0, 1024 * 13 / 64],
872            ),
873            14 => pack_64_14(
874                array_ref![input, 0, 1024],
875                array_mut_ref![output, 0, 1024 * 14 / 64],
876            ),
877            15 => pack_64_15(
878                array_ref![input, 0, 1024],
879                array_mut_ref![output, 0, 1024 * 15 / 64],
880            ),
881            16 => pack_64_16(
882                array_ref![input, 0, 1024],
883                array_mut_ref![output, 0, 1024 * 16 / 64],
884            ),
885            17 => pack_64_17(
886                array_ref![input, 0, 1024],
887                array_mut_ref![output, 0, 1024 * 17 / 64],
888            ),
889            18 => pack_64_18(
890                array_ref![input, 0, 1024],
891                array_mut_ref![output, 0, 1024 * 18 / 64],
892            ),
893            19 => pack_64_19(
894                array_ref![input, 0, 1024],
895                array_mut_ref![output, 0, 1024 * 19 / 64],
896            ),
897
898            20 => pack_64_20(
899                array_ref![input, 0, 1024],
900                array_mut_ref![output, 0, 1024 * 20 / 64],
901            ),
902            21 => pack_64_21(
903                array_ref![input, 0, 1024],
904                array_mut_ref![output, 0, 1024 * 21 / 64],
905            ),
906            22 => pack_64_22(
907                array_ref![input, 0, 1024],
908                array_mut_ref![output, 0, 1024 * 22 / 64],
909            ),
910            23 => pack_64_23(
911                array_ref![input, 0, 1024],
912                array_mut_ref![output, 0, 1024 * 23 / 64],
913            ),
914            24 => pack_64_24(
915                array_ref![input, 0, 1024],
916                array_mut_ref![output, 0, 1024 * 24 / 64],
917            ),
918            25 => pack_64_25(
919                array_ref![input, 0, 1024],
920                array_mut_ref![output, 0, 1024 * 25 / 64],
921            ),
922            26 => pack_64_26(
923                array_ref![input, 0, 1024],
924                array_mut_ref![output, 0, 1024 * 26 / 64],
925            ),
926            27 => pack_64_27(
927                array_ref![input, 0, 1024],
928                array_mut_ref![output, 0, 1024 * 27 / 64],
929            ),
930            28 => pack_64_28(
931                array_ref![input, 0, 1024],
932                array_mut_ref![output, 0, 1024 * 28 / 64],
933            ),
934            29 => pack_64_29(
935                array_ref![input, 0, 1024],
936                array_mut_ref![output, 0, 1024 * 29 / 64],
937            ),
938
939            30 => pack_64_30(
940                array_ref![input, 0, 1024],
941                array_mut_ref![output, 0, 1024 * 30 / 64],
942            ),
943            31 => pack_64_31(
944                array_ref![input, 0, 1024],
945                array_mut_ref![output, 0, 1024 * 31 / 64],
946            ),
947            32 => pack_64_32(
948                array_ref![input, 0, 1024],
949                array_mut_ref![output, 0, 1024 * 32 / 64],
950            ),
951            33 => pack_64_33(
952                array_ref![input, 0, 1024],
953                array_mut_ref![output, 0, 1024 * 33 / 64],
954            ),
955            34 => pack_64_34(
956                array_ref![input, 0, 1024],
957                array_mut_ref![output, 0, 1024 * 34 / 64],
958            ),
959            35 => pack_64_35(
960                array_ref![input, 0, 1024],
961                array_mut_ref![output, 0, 1024 * 35 / 64],
962            ),
963            36 => pack_64_36(
964                array_ref![input, 0, 1024],
965                array_mut_ref![output, 0, 1024 * 36 / 64],
966            ),
967            37 => pack_64_37(
968                array_ref![input, 0, 1024],
969                array_mut_ref![output, 0, 1024 * 37 / 64],
970            ),
971            38 => pack_64_38(
972                array_ref![input, 0, 1024],
973                array_mut_ref![output, 0, 1024 * 38 / 64],
974            ),
975            39 => pack_64_39(
976                array_ref![input, 0, 1024],
977                array_mut_ref![output, 0, 1024 * 39 / 64],
978            ),
979
980            40 => pack_64_40(
981                array_ref![input, 0, 1024],
982                array_mut_ref![output, 0, 1024 * 40 / 64],
983            ),
984            41 => pack_64_41(
985                array_ref![input, 0, 1024],
986                array_mut_ref![output, 0, 1024 * 41 / 64],
987            ),
988            42 => pack_64_42(
989                array_ref![input, 0, 1024],
990                array_mut_ref![output, 0, 1024 * 42 / 64],
991            ),
992            43 => pack_64_43(
993                array_ref![input, 0, 1024],
994                array_mut_ref![output, 0, 1024 * 43 / 64],
995            ),
996            44 => pack_64_44(
997                array_ref![input, 0, 1024],
998                array_mut_ref![output, 0, 1024 * 44 / 64],
999            ),
1000            45 => pack_64_45(
1001                array_ref![input, 0, 1024],
1002                array_mut_ref![output, 0, 1024 * 45 / 64],
1003            ),
1004            46 => pack_64_46(
1005                array_ref![input, 0, 1024],
1006                array_mut_ref![output, 0, 1024 * 46 / 64],
1007            ),
1008            47 => pack_64_47(
1009                array_ref![input, 0, 1024],
1010                array_mut_ref![output, 0, 1024 * 47 / 64],
1011            ),
1012            48 => pack_64_48(
1013                array_ref![input, 0, 1024],
1014                array_mut_ref![output, 0, 1024 * 48 / 64],
1015            ),
1016            49 => pack_64_49(
1017                array_ref![input, 0, 1024],
1018                array_mut_ref![output, 0, 1024 * 49 / 64],
1019            ),
1020
1021            50 => pack_64_50(
1022                array_ref![input, 0, 1024],
1023                array_mut_ref![output, 0, 1024 * 50 / 64],
1024            ),
1025            51 => pack_64_51(
1026                array_ref![input, 0, 1024],
1027                array_mut_ref![output, 0, 1024 * 51 / 64],
1028            ),
1029            52 => pack_64_52(
1030                array_ref![input, 0, 1024],
1031                array_mut_ref![output, 0, 1024 * 52 / 64],
1032            ),
1033            53 => pack_64_53(
1034                array_ref![input, 0, 1024],
1035                array_mut_ref![output, 0, 1024 * 53 / 64],
1036            ),
1037            54 => pack_64_54(
1038                array_ref![input, 0, 1024],
1039                array_mut_ref![output, 0, 1024 * 54 / 64],
1040            ),
1041            55 => pack_64_55(
1042                array_ref![input, 0, 1024],
1043                array_mut_ref![output, 0, 1024 * 55 / 64],
1044            ),
1045            56 => pack_64_56(
1046                array_ref![input, 0, 1024],
1047                array_mut_ref![output, 0, 1024 * 56 / 64],
1048            ),
1049            57 => pack_64_57(
1050                array_ref![input, 0, 1024],
1051                array_mut_ref![output, 0, 1024 * 57 / 64],
1052            ),
1053            58 => pack_64_58(
1054                array_ref![input, 0, 1024],
1055                array_mut_ref![output, 0, 1024 * 58 / 64],
1056            ),
1057            59 => pack_64_59(
1058                array_ref![input, 0, 1024],
1059                array_mut_ref![output, 0, 1024 * 59 / 64],
1060            ),
1061
1062            60 => pack_64_60(
1063                array_ref![input, 0, 1024],
1064                array_mut_ref![output, 0, 1024 * 60 / 64],
1065            ),
1066            61 => pack_64_61(
1067                array_ref![input, 0, 1024],
1068                array_mut_ref![output, 0, 1024 * 61 / 64],
1069            ),
1070            62 => pack_64_62(
1071                array_ref![input, 0, 1024],
1072                array_mut_ref![output, 0, 1024 * 62 / 64],
1073            ),
1074            63 => pack_64_63(
1075                array_ref![input, 0, 1024],
1076                array_mut_ref![output, 0, 1024 * 63 / 64],
1077            ),
1078            64 => pack_64_64(
1079                array_ref![input, 0, 1024],
1080                array_mut_ref![output, 0, 1024 * 64 / 64],
1081            ),
1082
1083            _ => unreachable!("Unsupported width: {}", width),
1084        }
1085    }
1086
1087    unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]) {
1088        let packed_len = 128 * width / size_of::<Self>();
1089        debug_assert_eq!(
1090            input.len(),
1091            packed_len,
1092            "Input buffer must be of size 1024 * W / T"
1093        );
1094        debug_assert_eq!(output.len(), 1024, "Output buffer must be of size 1024");
1095        debug_assert!(
1096            width <= Self::T,
1097            "Width must be less than or equal to {}",
1098            Self::T
1099        );
1100
1101        match width {
1102            0 => {
1103                output.fill(0);
1104            }
1105            1 => unpack_64_1(
1106                array_ref![input, 0, 1024 / 64],
1107                array_mut_ref![output, 0, 1024],
1108            ),
1109            2 => unpack_64_2(
1110                array_ref![input, 0, 1024 * 2 / 64],
1111                array_mut_ref![output, 0, 1024],
1112            ),
1113            3 => unpack_64_3(
1114                array_ref![input, 0, 1024 * 3 / 64],
1115                array_mut_ref![output, 0, 1024],
1116            ),
1117            4 => unpack_64_4(
1118                array_ref![input, 0, 1024 * 4 / 64],
1119                array_mut_ref![output, 0, 1024],
1120            ),
1121            5 => unpack_64_5(
1122                array_ref![input, 0, 1024 * 5 / 64],
1123                array_mut_ref![output, 0, 1024],
1124            ),
1125            6 => unpack_64_6(
1126                array_ref![input, 0, 1024 * 6 / 64],
1127                array_mut_ref![output, 0, 1024],
1128            ),
1129            7 => unpack_64_7(
1130                array_ref![input, 0, 1024 * 7 / 64],
1131                array_mut_ref![output, 0, 1024],
1132            ),
1133            8 => unpack_64_8(
1134                array_ref![input, 0, 1024 * 8 / 64],
1135                array_mut_ref![output, 0, 1024],
1136            ),
1137            9 => unpack_64_9(
1138                array_ref![input, 0, 1024 * 9 / 64],
1139                array_mut_ref![output, 0, 1024],
1140            ),
1141
1142            10 => unpack_64_10(
1143                array_ref![input, 0, 1024 * 10 / 64],
1144                array_mut_ref![output, 0, 1024],
1145            ),
1146            11 => unpack_64_11(
1147                array_ref![input, 0, 1024 * 11 / 64],
1148                array_mut_ref![output, 0, 1024],
1149            ),
1150            12 => unpack_64_12(
1151                array_ref![input, 0, 1024 * 12 / 64],
1152                array_mut_ref![output, 0, 1024],
1153            ),
1154            13 => unpack_64_13(
1155                array_ref![input, 0, 1024 * 13 / 64],
1156                array_mut_ref![output, 0, 1024],
1157            ),
1158            14 => unpack_64_14(
1159                array_ref![input, 0, 1024 * 14 / 64],
1160                array_mut_ref![output, 0, 1024],
1161            ),
1162            15 => unpack_64_15(
1163                array_ref![input, 0, 1024 * 15 / 64],
1164                array_mut_ref![output, 0, 1024],
1165            ),
1166            16 => unpack_64_16(
1167                array_ref![input, 0, 1024 * 16 / 64],
1168                array_mut_ref![output, 0, 1024],
1169            ),
1170            17 => unpack_64_17(
1171                array_ref![input, 0, 1024 * 17 / 64],
1172                array_mut_ref![output, 0, 1024],
1173            ),
1174            18 => unpack_64_18(
1175                array_ref![input, 0, 1024 * 18 / 64],
1176                array_mut_ref![output, 0, 1024],
1177            ),
1178            19 => unpack_64_19(
1179                array_ref![input, 0, 1024 * 19 / 64],
1180                array_mut_ref![output, 0, 1024],
1181            ),
1182
1183            20 => unpack_64_20(
1184                array_ref![input, 0, 1024 * 20 / 64],
1185                array_mut_ref![output, 0, 1024],
1186            ),
1187            21 => unpack_64_21(
1188                array_ref![input, 0, 1024 * 21 / 64],
1189                array_mut_ref![output, 0, 1024],
1190            ),
1191            22 => unpack_64_22(
1192                array_ref![input, 0, 1024 * 22 / 64],
1193                array_mut_ref![output, 0, 1024],
1194            ),
1195            23 => unpack_64_23(
1196                array_ref![input, 0, 1024 * 23 / 64],
1197                array_mut_ref![output, 0, 1024],
1198            ),
1199            24 => unpack_64_24(
1200                array_ref![input, 0, 1024 * 24 / 64],
1201                array_mut_ref![output, 0, 1024],
1202            ),
1203            25 => unpack_64_25(
1204                array_ref![input, 0, 1024 * 25 / 64],
1205                array_mut_ref![output, 0, 1024],
1206            ),
1207            26 => unpack_64_26(
1208                array_ref![input, 0, 1024 * 26 / 64],
1209                array_mut_ref![output, 0, 1024],
1210            ),
1211            27 => unpack_64_27(
1212                array_ref![input, 0, 1024 * 27 / 64],
1213                array_mut_ref![output, 0, 1024],
1214            ),
1215            28 => unpack_64_28(
1216                array_ref![input, 0, 1024 * 28 / 64],
1217                array_mut_ref![output, 0, 1024],
1218            ),
1219            29 => unpack_64_29(
1220                array_ref![input, 0, 1024 * 29 / 64],
1221                array_mut_ref![output, 0, 1024],
1222            ),
1223
1224            30 => unpack_64_30(
1225                array_ref![input, 0, 1024 * 30 / 64],
1226                array_mut_ref![output, 0, 1024],
1227            ),
1228            31 => unpack_64_31(
1229                array_ref![input, 0, 1024 * 31 / 64],
1230                array_mut_ref![output, 0, 1024],
1231            ),
1232            32 => unpack_64_32(
1233                array_ref![input, 0, 1024 * 32 / 64],
1234                array_mut_ref![output, 0, 1024],
1235            ),
1236            33 => unpack_64_33(
1237                array_ref![input, 0, 1024 * 33 / 64],
1238                array_mut_ref![output, 0, 1024],
1239            ),
1240            34 => unpack_64_34(
1241                array_ref![input, 0, 1024 * 34 / 64],
1242                array_mut_ref![output, 0, 1024],
1243            ),
1244            35 => unpack_64_35(
1245                array_ref![input, 0, 1024 * 35 / 64],
1246                array_mut_ref![output, 0, 1024],
1247            ),
1248            36 => unpack_64_36(
1249                array_ref![input, 0, 1024 * 36 / 64],
1250                array_mut_ref![output, 0, 1024],
1251            ),
1252            37 => unpack_64_37(
1253                array_ref![input, 0, 1024 * 37 / 64],
1254                array_mut_ref![output, 0, 1024],
1255            ),
1256            38 => unpack_64_38(
1257                array_ref![input, 0, 1024 * 38 / 64],
1258                array_mut_ref![output, 0, 1024],
1259            ),
1260            39 => unpack_64_39(
1261                array_ref![input, 0, 1024 * 39 / 64],
1262                array_mut_ref![output, 0, 1024],
1263            ),
1264
1265            40 => unpack_64_40(
1266                array_ref![input, 0, 1024 * 40 / 64],
1267                array_mut_ref![output, 0, 1024],
1268            ),
1269            41 => unpack_64_41(
1270                array_ref![input, 0, 1024 * 41 / 64],
1271                array_mut_ref![output, 0, 1024],
1272            ),
1273            42 => unpack_64_42(
1274                array_ref![input, 0, 1024 * 42 / 64],
1275                array_mut_ref![output, 0, 1024],
1276            ),
1277            43 => unpack_64_43(
1278                array_ref![input, 0, 1024 * 43 / 64],
1279                array_mut_ref![output, 0, 1024],
1280            ),
1281            44 => unpack_64_44(
1282                array_ref![input, 0, 1024 * 44 / 64],
1283                array_mut_ref![output, 0, 1024],
1284            ),
1285            45 => unpack_64_45(
1286                array_ref![input, 0, 1024 * 45 / 64],
1287                array_mut_ref![output, 0, 1024],
1288            ),
1289            46 => unpack_64_46(
1290                array_ref![input, 0, 1024 * 46 / 64],
1291                array_mut_ref![output, 0, 1024],
1292            ),
1293            47 => unpack_64_47(
1294                array_ref![input, 0, 1024 * 47 / 64],
1295                array_mut_ref![output, 0, 1024],
1296            ),
1297            48 => unpack_64_48(
1298                array_ref![input, 0, 1024 * 48 / 64],
1299                array_mut_ref![output, 0, 1024],
1300            ),
1301            49 => unpack_64_49(
1302                array_ref![input, 0, 1024 * 49 / 64],
1303                array_mut_ref![output, 0, 1024],
1304            ),
1305
1306            50 => unpack_64_50(
1307                array_ref![input, 0, 1024 * 50 / 64],
1308                array_mut_ref![output, 0, 1024],
1309            ),
1310            51 => unpack_64_51(
1311                array_ref![input, 0, 1024 * 51 / 64],
1312                array_mut_ref![output, 0, 1024],
1313            ),
1314            52 => unpack_64_52(
1315                array_ref![input, 0, 1024 * 52 / 64],
1316                array_mut_ref![output, 0, 1024],
1317            ),
1318            53 => unpack_64_53(
1319                array_ref![input, 0, 1024 * 53 / 64],
1320                array_mut_ref![output, 0, 1024],
1321            ),
1322            54 => unpack_64_54(
1323                array_ref![input, 0, 1024 * 54 / 64],
1324                array_mut_ref![output, 0, 1024],
1325            ),
1326            55 => unpack_64_55(
1327                array_ref![input, 0, 1024 * 55 / 64],
1328                array_mut_ref![output, 0, 1024],
1329            ),
1330            56 => unpack_64_56(
1331                array_ref![input, 0, 1024 * 56 / 64],
1332                array_mut_ref![output, 0, 1024],
1333            ),
1334            57 => unpack_64_57(
1335                array_ref![input, 0, 1024 * 57 / 64],
1336                array_mut_ref![output, 0, 1024],
1337            ),
1338            58 => unpack_64_58(
1339                array_ref![input, 0, 1024 * 58 / 64],
1340                array_mut_ref![output, 0, 1024],
1341            ),
1342            59 => unpack_64_59(
1343                array_ref![input, 0, 1024 * 59 / 64],
1344                array_mut_ref![output, 0, 1024],
1345            ),
1346
1347            60 => unpack_64_60(
1348                array_ref![input, 0, 1024 * 60 / 64],
1349                array_mut_ref![output, 0, 1024],
1350            ),
1351            61 => unpack_64_61(
1352                array_ref![input, 0, 1024 * 61 / 64],
1353                array_mut_ref![output, 0, 1024],
1354            ),
1355            62 => unpack_64_62(
1356                array_ref![input, 0, 1024 * 62 / 64],
1357                array_mut_ref![output, 0, 1024],
1358            ),
1359            63 => unpack_64_63(
1360                array_ref![input, 0, 1024 * 63 / 64],
1361                array_mut_ref![output, 0, 1024],
1362            ),
1363            64 => unpack_64_64(
1364                array_ref![input, 0, 1024 * 64 / 64],
1365                array_mut_ref![output, 0, 1024],
1366            ),
1367
1368            _ => unreachable!("Unsupported width: {}", width),
1369        }
1370    }
1371}
1372
1373macro_rules! unpack_8 {
1374    ($name:ident, $bits:expr) => {
1375        fn $name(input: &[u8; 1024 * $bits / u8::T], output: &mut [u8; 1024]) {
1376            for lane in 0..u8::LANES {
1377                unpack!(u8, $bits, input, lane, |$idx, $elem| {
1378                    output[$idx] = $elem;
1379                });
1380            }
1381        }
1382    };
1383}
1384
1385unpack_8!(unpack_8_1, 1);
1386unpack_8!(unpack_8_2, 2);
1387unpack_8!(unpack_8_3, 3);
1388unpack_8!(unpack_8_4, 4);
1389unpack_8!(unpack_8_5, 5);
1390unpack_8!(unpack_8_6, 6);
1391unpack_8!(unpack_8_7, 7);
1392unpack_8!(unpack_8_8, 8);
1393
1394macro_rules! pack_8 {
1395    ($name:ident, $bits:expr) => {
1396        fn $name(input: &[u8; 1024], output: &mut [u8; 1024 * $bits / u8::T]) {
1397            for lane in 0..u8::LANES {
1398                pack!(u8, $bits, output, lane, |$idx| { input[$idx] });
1399            }
1400        }
1401    };
1402}
1403pack_8!(pack_8_1, 1);
1404pack_8!(pack_8_2, 2);
1405pack_8!(pack_8_3, 3);
1406pack_8!(pack_8_4, 4);
1407pack_8!(pack_8_5, 5);
1408pack_8!(pack_8_6, 6);
1409pack_8!(pack_8_7, 7);
1410pack_8!(pack_8_8, 8);
1411
1412macro_rules! unpack_16 {
1413    ($name:ident, $bits:expr) => {
1414        fn $name(input: &[u16; 1024 * $bits / u16::T], output: &mut [u16; 1024]) {
1415            for lane in 0..u16::LANES {
1416                unpack!(u16, $bits, input, lane, |$idx, $elem| {
1417                    output[$idx] = $elem;
1418                });
1419            }
1420        }
1421    };
1422}
1423
1424unpack_16!(unpack_16_1, 1);
1425unpack_16!(unpack_16_2, 2);
1426unpack_16!(unpack_16_3, 3);
1427unpack_16!(unpack_16_4, 4);
1428unpack_16!(unpack_16_5, 5);
1429unpack_16!(unpack_16_6, 6);
1430unpack_16!(unpack_16_7, 7);
1431unpack_16!(unpack_16_8, 8);
1432unpack_16!(unpack_16_9, 9);
1433unpack_16!(unpack_16_10, 10);
1434unpack_16!(unpack_16_11, 11);
1435unpack_16!(unpack_16_12, 12);
1436unpack_16!(unpack_16_13, 13);
1437unpack_16!(unpack_16_14, 14);
1438unpack_16!(unpack_16_15, 15);
1439unpack_16!(unpack_16_16, 16);
1440
1441macro_rules! pack_16 {
1442    ($name:ident, $bits:expr) => {
1443        fn $name(input: &[u16; 1024], output: &mut [u16; 1024 * $bits / u16::T]) {
1444            for lane in 0..u16::LANES {
1445                pack!(u16, $bits, output, lane, |$idx| { input[$idx] });
1446            }
1447        }
1448    };
1449}
1450
1451pack_16!(pack_16_1, 1);
1452pack_16!(pack_16_2, 2);
1453pack_16!(pack_16_3, 3);
1454pack_16!(pack_16_4, 4);
1455pack_16!(pack_16_5, 5);
1456pack_16!(pack_16_6, 6);
1457pack_16!(pack_16_7, 7);
1458pack_16!(pack_16_8, 8);
1459pack_16!(pack_16_9, 9);
1460pack_16!(pack_16_10, 10);
1461pack_16!(pack_16_11, 11);
1462pack_16!(pack_16_12, 12);
1463pack_16!(pack_16_13, 13);
1464pack_16!(pack_16_14, 14);
1465pack_16!(pack_16_15, 15);
1466pack_16!(pack_16_16, 16);
1467
1468macro_rules! unpack_32 {
1469    ($name:ident, $bit_width:expr) => {
1470        fn $name(input: &[u32; 1024 * $bit_width / u32::T], output: &mut [u32; 1024]) {
1471            for lane in 0..u32::LANES {
1472                unpack!(u32, $bit_width, input, lane, |$idx, $elem| {
1473                    output[$idx] = $elem
1474                });
1475            }
1476        }
1477    };
1478}
1479
1480unpack_32!(unpack_32_1, 1);
1481unpack_32!(unpack_32_2, 2);
1482unpack_32!(unpack_32_3, 3);
1483unpack_32!(unpack_32_4, 4);
1484unpack_32!(unpack_32_5, 5);
1485unpack_32!(unpack_32_6, 6);
1486unpack_32!(unpack_32_7, 7);
1487unpack_32!(unpack_32_8, 8);
1488unpack_32!(unpack_32_9, 9);
1489unpack_32!(unpack_32_10, 10);
1490unpack_32!(unpack_32_11, 11);
1491unpack_32!(unpack_32_12, 12);
1492unpack_32!(unpack_32_13, 13);
1493unpack_32!(unpack_32_14, 14);
1494unpack_32!(unpack_32_15, 15);
1495unpack_32!(unpack_32_16, 16);
1496unpack_32!(unpack_32_17, 17);
1497unpack_32!(unpack_32_18, 18);
1498unpack_32!(unpack_32_19, 19);
1499unpack_32!(unpack_32_20, 20);
1500unpack_32!(unpack_32_21, 21);
1501unpack_32!(unpack_32_22, 22);
1502unpack_32!(unpack_32_23, 23);
1503unpack_32!(unpack_32_24, 24);
1504unpack_32!(unpack_32_25, 25);
1505unpack_32!(unpack_32_26, 26);
1506unpack_32!(unpack_32_27, 27);
1507unpack_32!(unpack_32_28, 28);
1508unpack_32!(unpack_32_29, 29);
1509unpack_32!(unpack_32_30, 30);
1510unpack_32!(unpack_32_31, 31);
1511unpack_32!(unpack_32_32, 32);
1512
1513macro_rules! pack_32 {
1514    ($name:ident, $bits:expr) => {
1515        fn $name(input: &[u32; 1024], output: &mut [u32; 1024 * $bits / u32::BITS as usize]) {
1516            for lane in 0..u32::LANES {
1517                pack!(u32, $bits, output, lane, |$idx| { input[$idx] });
1518            }
1519        }
1520    };
1521}
1522
1523pack_32!(pack_32_1, 1);
1524pack_32!(pack_32_2, 2);
1525pack_32!(pack_32_3, 3);
1526pack_32!(pack_32_4, 4);
1527pack_32!(pack_32_5, 5);
1528pack_32!(pack_32_6, 6);
1529pack_32!(pack_32_7, 7);
1530pack_32!(pack_32_8, 8);
1531pack_32!(pack_32_9, 9);
1532pack_32!(pack_32_10, 10);
1533pack_32!(pack_32_11, 11);
1534pack_32!(pack_32_12, 12);
1535pack_32!(pack_32_13, 13);
1536pack_32!(pack_32_14, 14);
1537pack_32!(pack_32_15, 15);
1538pack_32!(pack_32_16, 16);
1539pack_32!(pack_32_17, 17);
1540pack_32!(pack_32_18, 18);
1541pack_32!(pack_32_19, 19);
1542pack_32!(pack_32_20, 20);
1543pack_32!(pack_32_21, 21);
1544pack_32!(pack_32_22, 22);
1545pack_32!(pack_32_23, 23);
1546pack_32!(pack_32_24, 24);
1547pack_32!(pack_32_25, 25);
1548pack_32!(pack_32_26, 26);
1549pack_32!(pack_32_27, 27);
1550pack_32!(pack_32_28, 28);
1551pack_32!(pack_32_29, 29);
1552pack_32!(pack_32_30, 30);
1553pack_32!(pack_32_31, 31);
1554pack_32!(pack_32_32, 32);
1555
1556macro_rules! unpack_64 {
1557    ($name:ident, $bit_width:expr) => {
1558        fn $name(input: &[u64; 1024 * $bit_width / u64::T], output: &mut [u64; 1024]) {
1559            for lane in 0..u64::LANES {
1560                unpack!(u64, $bit_width, input, lane, |$idx, $elem| {
1561                    output[$idx] = $elem
1562                });
1563            }
1564        }
1565    };
1566}
1567
1568unpack_64!(unpack_64_1, 1);
1569unpack_64!(unpack_64_2, 2);
1570unpack_64!(unpack_64_3, 3);
1571unpack_64!(unpack_64_4, 4);
1572unpack_64!(unpack_64_5, 5);
1573unpack_64!(unpack_64_6, 6);
1574unpack_64!(unpack_64_7, 7);
1575unpack_64!(unpack_64_8, 8);
1576unpack_64!(unpack_64_9, 9);
1577unpack_64!(unpack_64_10, 10);
1578unpack_64!(unpack_64_11, 11);
1579unpack_64!(unpack_64_12, 12);
1580unpack_64!(unpack_64_13, 13);
1581unpack_64!(unpack_64_14, 14);
1582unpack_64!(unpack_64_15, 15);
1583unpack_64!(unpack_64_16, 16);
1584unpack_64!(unpack_64_17, 17);
1585unpack_64!(unpack_64_18, 18);
1586unpack_64!(unpack_64_19, 19);
1587unpack_64!(unpack_64_20, 20);
1588unpack_64!(unpack_64_21, 21);
1589unpack_64!(unpack_64_22, 22);
1590unpack_64!(unpack_64_23, 23);
1591unpack_64!(unpack_64_24, 24);
1592unpack_64!(unpack_64_25, 25);
1593unpack_64!(unpack_64_26, 26);
1594unpack_64!(unpack_64_27, 27);
1595unpack_64!(unpack_64_28, 28);
1596unpack_64!(unpack_64_29, 29);
1597unpack_64!(unpack_64_30, 30);
1598unpack_64!(unpack_64_31, 31);
1599unpack_64!(unpack_64_32, 32);
1600
1601unpack_64!(unpack_64_33, 33);
1602unpack_64!(unpack_64_34, 34);
1603unpack_64!(unpack_64_35, 35);
1604unpack_64!(unpack_64_36, 36);
1605unpack_64!(unpack_64_37, 37);
1606unpack_64!(unpack_64_38, 38);
1607unpack_64!(unpack_64_39, 39);
1608unpack_64!(unpack_64_40, 40);
1609unpack_64!(unpack_64_41, 41);
1610unpack_64!(unpack_64_42, 42);
1611unpack_64!(unpack_64_43, 43);
1612unpack_64!(unpack_64_44, 44);
1613unpack_64!(unpack_64_45, 45);
1614unpack_64!(unpack_64_46, 46);
1615unpack_64!(unpack_64_47, 47);
1616unpack_64!(unpack_64_48, 48);
1617unpack_64!(unpack_64_49, 49);
1618unpack_64!(unpack_64_50, 50);
1619unpack_64!(unpack_64_51, 51);
1620unpack_64!(unpack_64_52, 52);
1621unpack_64!(unpack_64_53, 53);
1622unpack_64!(unpack_64_54, 54);
1623unpack_64!(unpack_64_55, 55);
1624unpack_64!(unpack_64_56, 56);
1625unpack_64!(unpack_64_57, 57);
1626unpack_64!(unpack_64_58, 58);
1627unpack_64!(unpack_64_59, 59);
1628unpack_64!(unpack_64_60, 60);
1629unpack_64!(unpack_64_61, 61);
1630unpack_64!(unpack_64_62, 62);
1631unpack_64!(unpack_64_63, 63);
1632unpack_64!(unpack_64_64, 64);
1633
1634macro_rules! pack_64 {
1635    ($name:ident, $bits:expr) => {
1636        fn $name(input: &[u64; 1024], output: &mut [u64; 1024 * $bits / u64::BITS as usize]) {
1637            for lane in 0..u64::LANES {
1638                pack!(u64, $bits, output, lane, |$idx| { input[$idx] });
1639            }
1640        }
1641    };
1642}
1643
// Instantiate one `pack_64_<W>` function for every bit width W from 1 through 64.
// Each invocation expands `pack_64!` with the function name and its fixed width.
pack_64!(pack_64_1, 1);
pack_64!(pack_64_2, 2);
pack_64!(pack_64_3, 3);
pack_64!(pack_64_4, 4);
pack_64!(pack_64_5, 5);
pack_64!(pack_64_6, 6);
pack_64!(pack_64_7, 7);
pack_64!(pack_64_8, 8);
pack_64!(pack_64_9, 9);
pack_64!(pack_64_10, 10);
pack_64!(pack_64_11, 11);
pack_64!(pack_64_12, 12);
pack_64!(pack_64_13, 13);
pack_64!(pack_64_14, 14);
pack_64!(pack_64_15, 15);
pack_64!(pack_64_16, 16);
pack_64!(pack_64_17, 17);
pack_64!(pack_64_18, 18);
pack_64!(pack_64_19, 19);
pack_64!(pack_64_20, 20);
pack_64!(pack_64_21, 21);
pack_64!(pack_64_22, 22);
pack_64!(pack_64_23, 23);
pack_64!(pack_64_24, 24);
pack_64!(pack_64_25, 25);
pack_64!(pack_64_26, 26);
pack_64!(pack_64_27, 27);
pack_64!(pack_64_28, 28);
pack_64!(pack_64_29, 29);
pack_64!(pack_64_30, 30);
pack_64!(pack_64_31, 31);
pack_64!(pack_64_32, 32);

// Widths above 32 bits.
pack_64!(pack_64_33, 33);
pack_64!(pack_64_34, 34);
pack_64!(pack_64_35, 35);
pack_64!(pack_64_36, 36);
pack_64!(pack_64_37, 37);
pack_64!(pack_64_38, 38);
pack_64!(pack_64_39, 39);
pack_64!(pack_64_40, 40);
pack_64!(pack_64_41, 41);
pack_64!(pack_64_42, 42);
pack_64!(pack_64_43, 43);
pack_64!(pack_64_44, 44);
pack_64!(pack_64_45, 45);
pack_64!(pack_64_46, 46);
pack_64!(pack_64_47, 47);
pack_64!(pack_64_48, 48);
pack_64!(pack_64_49, 49);
pack_64!(pack_64_50, 50);
pack_64!(pack_64_51, 51);
pack_64!(pack_64_52, 52);
pack_64!(pack_64_53, 53);
pack_64!(pack_64_54, 54);
pack_64!(pack_64_55, 55);
pack_64!(pack_64_56, 56);
pack_64!(pack_64_57, 57);
pack_64!(pack_64_58, 58);
pack_64!(pack_64_59, 59);
pack_64!(pack_64_60, 60);
pack_64!(pack_64_61, 61);
pack_64!(pack_64_62, 62);
pack_64!(pack_64_63, 63);
pack_64!(pack_64_64, 64);
1709
1710#[cfg(test)]
1711mod test {
1712    use super::*;
1713    use core::array;
1714    // a fast random number generator
1715    pub struct XorShift {
1716        state: u64,
1717    }
1718
1719    impl XorShift {
1720        pub fn new(seed: u64) -> Self {
1721            Self { state: seed }
1722        }
1723
1724        pub fn next(&mut self) -> u64 {
1725            let mut x = self.state;
1726            x ^= x << 13;
1727            x ^= x >> 7;
1728            x ^= x << 17;
1729            self.state = x;
1730            x
1731        }
1732    }
1733
1734    // a macro version of this function generalize u8, u16, u32, u64 takes very long time for a test build, so I
1735    // write it for each type separately
1736    fn pack_unpack_u8(bit_width: usize) {
1737        let mut values: [u8; 1024] = [0; 1024];
1738        let mut rng = XorShift::new(123456789);
1739        for value in &mut values {
1740            *value = (rng.next() % (1 << bit_width)) as u8;
1741        }
1742
1743        let mut packed = vec![0; 1024 * bit_width / 8];
1744        for lane in 0..u8::LANES {
1745            // Always loop over lanes first. This is what the compiler vectorizes.
1746            pack!(u8, bit_width, packed, lane, |$pos| {
1747                values[$pos]
1748            });
1749        }
1750
1751        let mut unpacked: [u8; 1024] = [0; 1024];
1752        for lane in 0..u8::LANES {
1753            // Always loop over lanes first. This is what the compiler vectorizes.
1754            unpack!(u8, bit_width, packed, lane, |$idx, $elem| {
1755                unpacked[$idx] = $elem;
1756            });
1757        }
1758
1759        assert_eq!(values, unpacked);
1760    }
1761
1762    fn pack_unpack_u16(bit_width: usize) {
1763        let mut values: [u16; 1024] = [0; 1024];
1764        let mut rng = XorShift::new(123456789);
1765        for value in &mut values {
1766            *value = (rng.next() % (1 << bit_width)) as u16;
1767        }
1768
1769        let mut packed = vec![0; 1024 * bit_width / 16];
1770        for lane in 0..u16::LANES {
1771            // Always loop over lanes first. This is what the compiler vectorizes.
1772            pack!(u16, bit_width, packed, lane, |$pos| {
1773                values[$pos]
1774            });
1775        }
1776
1777        let mut unpacked: [u16; 1024] = [0; 1024];
1778        for lane in 0..u16::LANES {
1779            // Always loop over lanes first. This is what the compiler vectorizes.
1780            unpack!(u16, bit_width, packed, lane, |$idx, $elem| {
1781                unpacked[$idx] = $elem;
1782            });
1783        }
1784
1785        assert_eq!(values, unpacked);
1786    }
1787
1788    fn pack_unpack_u32(bit_width: usize) {
1789        let mut values: [u32; 1024] = [0; 1024];
1790        let mut rng = XorShift::new(123456789);
1791        for value in &mut values {
1792            *value = (rng.next() % (1 << bit_width)) as u32;
1793        }
1794
1795        let mut packed = vec![0; 1024 * bit_width / 32];
1796        for lane in 0..u32::LANES {
1797            // Always loop over lanes first. This is what the compiler vectorizes.
1798            pack!(u32, bit_width, packed, lane, |$pos| {
1799                values[$pos]
1800            });
1801        }
1802
1803        let mut unpacked: [u32; 1024] = [0; 1024];
1804        for lane in 0..u32::LANES {
1805            // Always loop over lanes first. This is what the compiler vectorizes.
1806            unpack!(u32, bit_width, packed, lane, |$idx, $elem| {
1807                unpacked[$idx] = $elem;
1808            });
1809        }
1810
1811        assert_eq!(values, unpacked);
1812    }
1813
1814    fn pack_unpack_u64(bit_width: usize) {
1815        let mut values: [u64; 1024] = [0; 1024];
1816        let mut rng = XorShift::new(123456789);
1817        if bit_width == 64 {
1818            for value in &mut values {
1819                *value = rng.next();
1820            }
1821        } else {
1822            for value in &mut values {
1823                *value = rng.next() % (1 << bit_width);
1824            }
1825        }
1826
1827        let mut packed = vec![0; 1024 * bit_width / 64];
1828        for lane in 0..u64::LANES {
1829            // Always loop over lanes first. This is what the compiler vectorizes.
1830            pack!(u64, bit_width, packed, lane, |$pos| {
1831                values[$pos]
1832            });
1833        }
1834
1835        let mut unpacked: [u64; 1024] = [0; 1024];
1836        for lane in 0..u64::LANES {
1837            // Always loop over lanes first. This is what the compiler vectorizes.
1838            unpack!(u64, bit_width, packed, lane, |$idx, $elem| {
1839                unpacked[$idx] = $elem;
1840            });
1841        }
1842
1843        assert_eq!(values, unpacked);
1844    }
1845
1846    #[test]
1847    fn test_pack() {
1848        pack_unpack_u8(0);
1849        pack_unpack_u8(1);
1850        pack_unpack_u8(2);
1851        pack_unpack_u8(3);
1852        pack_unpack_u8(4);
1853        pack_unpack_u8(5);
1854        pack_unpack_u8(6);
1855        pack_unpack_u8(7);
1856        pack_unpack_u8(8);
1857
1858        pack_unpack_u16(0);
1859        pack_unpack_u16(1);
1860        pack_unpack_u16(2);
1861        pack_unpack_u16(3);
1862        pack_unpack_u16(4);
1863        pack_unpack_u16(5);
1864        pack_unpack_u16(6);
1865        pack_unpack_u16(7);
1866        pack_unpack_u16(8);
1867        pack_unpack_u16(9);
1868        pack_unpack_u16(10);
1869        pack_unpack_u16(11);
1870        pack_unpack_u16(12);
1871        pack_unpack_u16(13);
1872        pack_unpack_u16(14);
1873        pack_unpack_u16(15);
1874        pack_unpack_u16(16);
1875
1876        pack_unpack_u32(0);
1877        pack_unpack_u32(1);
1878        pack_unpack_u32(2);
1879        pack_unpack_u32(3);
1880        pack_unpack_u32(4);
1881        pack_unpack_u32(5);
1882        pack_unpack_u32(6);
1883        pack_unpack_u32(7);
1884        pack_unpack_u32(8);
1885        pack_unpack_u32(9);
1886        pack_unpack_u32(10);
1887        pack_unpack_u32(11);
1888        pack_unpack_u32(12);
1889        pack_unpack_u32(13);
1890        pack_unpack_u32(14);
1891        pack_unpack_u32(15);
1892        pack_unpack_u32(16);
1893        pack_unpack_u32(17);
1894        pack_unpack_u32(18);
1895        pack_unpack_u32(19);
1896        pack_unpack_u32(20);
1897        pack_unpack_u32(21);
1898        pack_unpack_u32(22);
1899        pack_unpack_u32(23);
1900        pack_unpack_u32(24);
1901        pack_unpack_u32(25);
1902        pack_unpack_u32(26);
1903        pack_unpack_u32(27);
1904        pack_unpack_u32(28);
1905        pack_unpack_u32(29);
1906        pack_unpack_u32(30);
1907        pack_unpack_u32(31);
1908        pack_unpack_u32(32);
1909
1910        pack_unpack_u64(0);
1911        pack_unpack_u64(1);
1912        pack_unpack_u64(2);
1913        pack_unpack_u64(3);
1914        pack_unpack_u64(4);
1915        pack_unpack_u64(5);
1916        pack_unpack_u64(6);
1917        pack_unpack_u64(7);
1918        pack_unpack_u64(8);
1919        pack_unpack_u64(9);
1920        pack_unpack_u64(10);
1921        pack_unpack_u64(11);
1922        pack_unpack_u64(12);
1923        pack_unpack_u64(13);
1924        pack_unpack_u64(14);
1925        pack_unpack_u64(15);
1926        pack_unpack_u64(16);
1927        pack_unpack_u64(17);
1928        pack_unpack_u64(18);
1929        pack_unpack_u64(19);
1930        pack_unpack_u64(20);
1931        pack_unpack_u64(21);
1932        pack_unpack_u64(22);
1933        pack_unpack_u64(23);
1934        pack_unpack_u64(24);
1935        pack_unpack_u64(25);
1936        pack_unpack_u64(26);
1937        pack_unpack_u64(27);
1938        pack_unpack_u64(28);
1939        pack_unpack_u64(29);
1940        pack_unpack_u64(30);
1941        pack_unpack_u64(31);
1942        pack_unpack_u64(32);
1943        pack_unpack_u64(33);
1944        pack_unpack_u64(34);
1945        pack_unpack_u64(35);
1946        pack_unpack_u64(36);
1947        pack_unpack_u64(37);
1948        pack_unpack_u64(38);
1949        pack_unpack_u64(39);
1950        pack_unpack_u64(40);
1951        pack_unpack_u64(41);
1952        pack_unpack_u64(42);
1953        pack_unpack_u64(43);
1954        pack_unpack_u64(44);
1955        pack_unpack_u64(45);
1956        pack_unpack_u64(46);
1957        pack_unpack_u64(47);
1958        pack_unpack_u64(48);
1959        pack_unpack_u64(49);
1960        pack_unpack_u64(50);
1961        pack_unpack_u64(51);
1962        pack_unpack_u64(52);
1963        pack_unpack_u64(53);
1964        pack_unpack_u64(54);
1965        pack_unpack_u64(55);
1966        pack_unpack_u64(56);
1967        pack_unpack_u64(57);
1968        pack_unpack_u64(58);
1969        pack_unpack_u64(59);
1970        pack_unpack_u64(60);
1971        pack_unpack_u64(61);
1972        pack_unpack_u64(62);
1973        pack_unpack_u64(63);
1974        pack_unpack_u64(64);
1975    }
1976
1977    fn unchecked_pack_unpack_u8(bit_width: usize) {
1978        let mut values = [0u8; 1024];
1979        let mut rng = XorShift::new(123456789);
1980        for value in &mut values {
1981            *value = (rng.next() % (1 << bit_width)) as u8;
1982        }
1983        let mut packed = vec![0; 1024 * bit_width / 8];
1984        unsafe {
1985            BitPacking::unchecked_pack(bit_width, &values, &mut packed);
1986        }
1987        let mut output = [0; 1024];
1988        unsafe { BitPacking::unchecked_unpack(bit_width, &packed, &mut output) };
1989        assert_eq!(values, output);
1990    }
1991
1992    fn unchecked_pack_unpack_u16(bit_width: usize) {
1993        let mut values = [0u16; 1024];
1994        let mut rng = XorShift::new(123456789);
1995        for value in &mut values {
1996            *value = (rng.next() % (1 << bit_width)) as u16;
1997        }
1998        let mut packed = vec![0; 1024 * bit_width / u16::T];
1999        unsafe {
2000            BitPacking::unchecked_pack(bit_width, &values, &mut packed);
2001        }
2002        let mut output = [0; 1024];
2003        unsafe { BitPacking::unchecked_unpack(bit_width, &packed, &mut output) };
2004        assert_eq!(values, output);
2005    }
2006
2007    fn unchecked_pack_unpack_u32(bit_width: usize) {
2008        let mut values = [0u32; 1024];
2009        let mut rng = XorShift::new(123456789);
2010        for value in &mut values {
2011            *value = (rng.next() % (1 << bit_width)) as u32;
2012        }
2013        let mut packed = vec![0; 1024 * bit_width / u32::T];
2014        unsafe {
2015            BitPacking::unchecked_pack(bit_width, &values, &mut packed);
2016        }
2017        let mut output = [0; 1024];
2018        unsafe { BitPacking::unchecked_unpack(bit_width, &packed, &mut output) };
2019        assert_eq!(values, output);
2020    }
2021
2022    fn unchecked_pack_unpack_u64(bit_width: usize) {
2023        let mut values = [0u64; 1024];
2024        let mut rng = XorShift::new(123456789);
2025        if bit_width == 64 {
2026            for value in &mut values {
2027                *value = rng.next();
2028            }
2029        }
2030        let mut packed = vec![0; 1024 * bit_width / u64::T];
2031        unsafe {
2032            BitPacking::unchecked_pack(bit_width, &values, &mut packed);
2033        }
2034        let mut output = [0; 1024];
2035        unsafe { BitPacking::unchecked_unpack(bit_width, &packed, &mut output) };
2036        assert_eq!(values, output);
2037    }
2038
2039    #[test]
2040    fn test_unchecked_pack() {
2041        let input = array::from_fn(|i| i as u32);
2042        let mut packed = [0; 320];
2043        unsafe { BitPacking::unchecked_pack(10, &input, &mut packed) };
2044        let mut output = [0; 1024];
2045        unsafe { BitPacking::unchecked_unpack(10, &packed, &mut output) };
2046        assert_eq!(input, output);
2047
2048        unchecked_pack_unpack_u8(1);
2049        unchecked_pack_unpack_u8(2);
2050        unchecked_pack_unpack_u8(3);
2051        unchecked_pack_unpack_u8(4);
2052        unchecked_pack_unpack_u8(5);
2053        unchecked_pack_unpack_u8(6);
2054        unchecked_pack_unpack_u8(7);
2055        unchecked_pack_unpack_u8(8);
2056
2057        unchecked_pack_unpack_u16(1);
2058        unchecked_pack_unpack_u16(2);
2059        unchecked_pack_unpack_u16(3);
2060        unchecked_pack_unpack_u16(4);
2061        unchecked_pack_unpack_u16(5);
2062        unchecked_pack_unpack_u16(6);
2063        unchecked_pack_unpack_u16(7);
2064        unchecked_pack_unpack_u16(8);
2065        unchecked_pack_unpack_u16(9);
2066        unchecked_pack_unpack_u16(10);
2067        unchecked_pack_unpack_u16(11);
2068        unchecked_pack_unpack_u16(12);
2069        unchecked_pack_unpack_u16(13);
2070        unchecked_pack_unpack_u16(14);
2071        unchecked_pack_unpack_u16(15);
2072        unchecked_pack_unpack_u16(16);
2073
2074        unchecked_pack_unpack_u32(1);
2075        unchecked_pack_unpack_u32(2);
2076        unchecked_pack_unpack_u32(3);
2077        unchecked_pack_unpack_u32(4);
2078        unchecked_pack_unpack_u32(5);
2079        unchecked_pack_unpack_u32(6);
2080        unchecked_pack_unpack_u32(7);
2081        unchecked_pack_unpack_u32(8);
2082        unchecked_pack_unpack_u32(9);
2083        unchecked_pack_unpack_u32(10);
2084        unchecked_pack_unpack_u32(11);
2085        unchecked_pack_unpack_u32(12);
2086        unchecked_pack_unpack_u32(13);
2087        unchecked_pack_unpack_u32(14);
2088        unchecked_pack_unpack_u32(15);
2089        unchecked_pack_unpack_u32(16);
2090        unchecked_pack_unpack_u32(17);
2091        unchecked_pack_unpack_u32(18);
2092        unchecked_pack_unpack_u32(19);
2093        unchecked_pack_unpack_u32(20);
2094        unchecked_pack_unpack_u32(21);
2095        unchecked_pack_unpack_u32(22);
2096        unchecked_pack_unpack_u32(23);
2097        unchecked_pack_unpack_u32(24);
2098        unchecked_pack_unpack_u32(25);
2099        unchecked_pack_unpack_u32(26);
2100        unchecked_pack_unpack_u32(27);
2101        unchecked_pack_unpack_u32(28);
2102        unchecked_pack_unpack_u32(29);
2103        unchecked_pack_unpack_u32(30);
2104        unchecked_pack_unpack_u32(31);
2105        unchecked_pack_unpack_u32(32);
2106
2107        unchecked_pack_unpack_u64(1);
2108        unchecked_pack_unpack_u64(2);
2109        unchecked_pack_unpack_u64(3);
2110        unchecked_pack_unpack_u64(4);
2111        unchecked_pack_unpack_u64(5);
2112        unchecked_pack_unpack_u64(6);
2113        unchecked_pack_unpack_u64(7);
2114        unchecked_pack_unpack_u64(8);
2115        unchecked_pack_unpack_u64(9);
2116        unchecked_pack_unpack_u64(10);
2117        unchecked_pack_unpack_u64(11);
2118        unchecked_pack_unpack_u64(12);
2119        unchecked_pack_unpack_u64(13);
2120        unchecked_pack_unpack_u64(14);
2121        unchecked_pack_unpack_u64(15);
2122        unchecked_pack_unpack_u64(16);
2123        unchecked_pack_unpack_u64(17);
2124        unchecked_pack_unpack_u64(18);
2125        unchecked_pack_unpack_u64(19);
2126        unchecked_pack_unpack_u64(20);
2127        unchecked_pack_unpack_u64(21);
2128        unchecked_pack_unpack_u64(22);
2129        unchecked_pack_unpack_u64(23);
2130        unchecked_pack_unpack_u64(24);
2131        unchecked_pack_unpack_u64(25);
2132        unchecked_pack_unpack_u64(26);
2133        unchecked_pack_unpack_u64(27);
2134        unchecked_pack_unpack_u64(28);
2135        unchecked_pack_unpack_u64(29);
2136        unchecked_pack_unpack_u64(30);
2137        unchecked_pack_unpack_u64(31);
2138        unchecked_pack_unpack_u64(32);
2139        unchecked_pack_unpack_u64(33);
2140        unchecked_pack_unpack_u64(34);
2141        unchecked_pack_unpack_u64(35);
2142        unchecked_pack_unpack_u64(36);
2143        unchecked_pack_unpack_u64(37);
2144        unchecked_pack_unpack_u64(38);
2145        unchecked_pack_unpack_u64(39);
2146        unchecked_pack_unpack_u64(40);
2147        unchecked_pack_unpack_u64(41);
2148        unchecked_pack_unpack_u64(42);
2149        unchecked_pack_unpack_u64(43);
2150        unchecked_pack_unpack_u64(44);
2151        unchecked_pack_unpack_u64(45);
2152        unchecked_pack_unpack_u64(46);
2153        unchecked_pack_unpack_u64(47);
2154        unchecked_pack_unpack_u64(48);
2155        unchecked_pack_unpack_u64(49);
2156        unchecked_pack_unpack_u64(50);
2157        unchecked_pack_unpack_u64(51);
2158        unchecked_pack_unpack_u64(52);
2159        unchecked_pack_unpack_u64(53);
2160        unchecked_pack_unpack_u64(54);
2161        unchecked_pack_unpack_u64(55);
2162        unchecked_pack_unpack_u64(56);
2163        unchecked_pack_unpack_u64(57);
2164        unchecked_pack_unpack_u64(58);
2165        unchecked_pack_unpack_u64(59);
2166        unchecked_pack_unpack_u64(60);
2167        unchecked_pack_unpack_u64(61);
2168        unchecked_pack_unpack_u64(62);
2169        unchecked_pack_unpack_u64(63);
2170        unchecked_pack_unpack_u64(64);
2171    }
2172}