Skip to main content

lance_bitpacking/
lib.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4// NOTICE:
5// This file is a modification of the `fastlanes` crate: https://github.com/spiraldb/fastlanes
6// It is modified to allow a rust stable build
7//
8// The original code can be accessed at
9//      https://github.com/spiraldb/fastlanes/blob/8e0ff374f815d919d0c0ebdccf5ffd9e6dc7d663/src/bitpacking.rs
10//      https://github.com/spiraldb/fastlanes/blob/8e0ff374f815d919d0c0ebdccf5ffd9e6dc7d663/src/lib.rs
11//      https://github.com/spiraldb/fastlanes/blob/8e0ff374f815d919d0c0ebdccf5ffd9e6dc7d663/src/macros.rs
12//
13// The original code is licensed under the Apache Software License:
14// https://github.com/spiraldb/fastlanes/blob/8e0ff374f815d919d0c0ebdccf5ffd9e6dc7d663/LICENSE
15
16use arrayref::{array_mut_ref, array_ref};
17use core::mem::size_of;
18
/// Permutation of `0..8` used by the `index` helpers inside the `pack!` and `unpack!`
/// macros: the row-group number (`row / 8`) is looked up here and the result is scaled
/// by 16 when a `(row, lane)` pair is turned into an element position.
19pub const FL_ORDER: [usize; 8] = [0, 4, 2, 6, 1, 5, 3, 7];
20
/// Lane geometry of an integer type inside a 1024-bit virtual word.
pub trait FastLanes: Sized + Copy {
    /// Width of `Self` in bits.
    const T: usize = 8 * size_of::<Self>();
    /// Number of `Self`-sized lanes that fit into 1024 bits.
    const LANES: usize = 1024 / Self::T;
}

// Every unsigned primitive integer relies on the default constants above.
macro_rules! impl_fast_lanes {
    ($($int:ty),* $(,)?) => {
        $(impl FastLanes for $int {})*
    };
}

impl_fast_lanes!(u8, u16, u32, u64);
31
// Expands to the fully unrolled packing code for ONE lane of a 1024-element block.
//
// Arguments:
// * `$T`      - element type; `<$T>::T` (its bit width) is also the number of rows that
//               `seq_t!` unrolls.
// * `$W`      - number of low bits kept from every value.
// * `$packed` - indexable destination; packed word `w` of this lane is stored at
//               `$packed[<$T>::LANES * w + $lane]`.
// * `$lane`   - lane number handled by this expansion.
// * `| $idx | body` - closure-like kernel: `body` is expanded with `$idx` bound to an
//               element index and must evaluate to the value to pack.
//
// NOTE(review): `$_1` looks like it is meant to receive a literal `$` token from the
// caller so the nested `__kernel__` macro can declare its own metavariable - confirm
// against the call sites, which are not visible here.
32macro_rules! pack {
33    ($T:ty, $W:expr, $packed:expr, $lane:expr, | $_1:tt $idx:ident | $($body:tt)*) => {
          // Wrap the caller's kernel body in a helper macro so it can be invoked once per row.
34        macro_rules! __kernel__ {( $_1 $idx:ident ) => ( $($body)* )}
35        {
36            use paste::paste;
37
38            // The number of bits of T.
39            const T: usize = <$T>::T;
40
              // Element index for (row, lane): `row % 8` selects a stride of 128, while the
              // row group `row / 8` is permuted through FL_ORDER and scaled by 16.
41            #[inline(always)]
42            fn index(row: usize, lane: usize) -> usize {
43                let o = row / 8;
44                let s = row % 8;
45                (FL_ORDER[o] * 16) + (s * 128) + lane
46            }
47
48            if $W == 0 {
49                // Nothing to do if W is 0, since the packed array is zero bytes.
50            } else if $W == T {
51                // Special case for W=T, we can just copy the input value directly to the packed value.
52                paste!(seq_t!(row in $T {
53                    let idx = index(row, $lane);
54                    $packed[<$T>::LANES * row + $lane] = __kernel__!(idx);
55                }));
56            } else {
57                // A mask of W bits.
58                let mask: $T = (1 << $W) - 1;
59
60                // Accumulator for the packed word that is currently being assembled.
61                let mut tmp: $T = 0;
62
63                // Loop over each of the rows of the lane.
64                // Inlining this loop means all branches are known at compile time and
65                // the code is auto-vectorized for SIMD execution.
66                paste!(seq_t!(row in $T {
67                    let idx = index(row, $lane);
68                    let src = __kernel__!(idx);
69                    let src = src & mask;
70
71                    // Shift the src bits into their position in the tmp output variable.
72                    if row == 0 {
73                        tmp = src;
74                    } else {
                          // `%` binds tighter than `<<`: the shift amount is `(row * W) % T`.
75                        tmp |= src << (row * $W) % T;
76                    }
77
78                    // If the next packed position is after our current one, then we have filled
79                    // the current output and we can write the packed value.
80                    let curr_word: usize = (row * $W) / T;
81                    let next_word: usize = ((row + 1) * $W) / T;
82
                      // The carry stored into `tmp` on the final row is never read again.
83                    #[allow(unused_assignments)]
84                    if next_word > curr_word {
85                        $packed[<$T>::LANES * curr_word + $lane] = tmp;
86                        let remaining_bits: usize = ((row + 1) * $W) % T;
87                        // Keep the remaining bits for the next packed value.
                          // `-` binds tighter than `>>`: the shift amount is `W - remaining_bits`.
88                        tmp = src >> $W - remaining_bits;
89                    }
90                }));
91            }
92        }
93    };
94}
95
// Expands to the fully unrolled unpacking code for ONE lane of a 1024-element block.
//
// Arguments:
// * `$T`      - element type; `<$T>::T` (its bit width) is also the number of rows that
//               `seq_t!` unrolls.
// * `$W`      - number of bits each packed value occupies.
// * `$packed` - indexable source; packed word `w` of this lane is read from
//               `$packed[<$T>::LANES * w + $lane]`.
// * `$lane`   - lane number handled by this expansion.
// * `| $idx, $elem | body` - closure-like kernel: `body` is expanded once per row with
//               `$idx` bound to the element index and `$elem` bound to the unpacked value.
//
// NOTE(review): `$_1` / `$_2` look like they are meant to receive literal `$` tokens from
// the caller so the nested `__kernel__` macro can declare its own metavariables - confirm
// against the call sites, which are not visible here.
96macro_rules! unpack {
97    ($T:ty, $W:expr, $packed:expr, $lane:expr, | $_1:tt $idx:ident, $_2:tt $elem:ident | $($body:tt)*) => {
          // Wrap the caller's kernel body in a helper macro so it can be invoked once per row.
98        macro_rules! __kernel__ {( $_1 $idx:ident, $_2 $elem:ident ) => ( $($body)* )}
99        {
100            use paste::paste;
101
102            // The number of bits of T.
103            const T: usize = <$T>::T;
104
               // Element index for (row, lane): `row % 8` selects a stride of 128, while the
               // row group `row / 8` is permuted through FL_ORDER and scaled by 16.
105            #[inline(always)]
106            fn index(row: usize, lane: usize) -> usize {
107                let o = row / 8;
108                let s = row % 8;
109                (FL_ORDER[o] * 16) + (s * 128) + lane
110            }
111
112            if $W == 0 {
113                // Special case for W=0, we just need to zero the output.
114                // We'll still respect the iteration order in case the kernel has side effects.
115                paste!(seq_t!(row in $T {
116                    let idx = index(row, $lane);
117                    let zero: $T = 0;
118                    __kernel__!(idx, zero);
119                }));
120            } else if $W == T {
121                // Special case for W=T, we can just copy the packed value directly to the output.
122                paste!(seq_t!(row in $T {
123                    let idx = index(row, $lane);
124                    let src = $packed[<$T>::LANES * row + $lane];
125                    __kernel__!(idx, src);
126                }));
127            } else {
                   // Mask with the low `width` bits set; `width == T` is handled separately
                   // because shifting by the full bit width is not allowed.
128                #[inline]
129                fn mask(width: usize) -> $T {
130                    if width == T { <$T>::MAX } else { (1 << (width % T)) - 1 }
131                }
132
                   // Start with packed word 0 of this lane; `src` is reloaded whenever a row
                   // crosses into the next packed word.
133                let mut src: $T = $packed[$lane];
134                let mut tmp: $T;
135
136                paste!(seq_t!(row in $T {
137                    // Figure out the packed positions
138                    let curr_word: usize = (row * $W) / T;
139                    let next_word = ((row + 1) * $W) / T;
140
141                    let shift = (row * $W) % T;
142
143                    if next_word > curr_word {
144                        // Consume some bits from the curr packed input, the remainder are in the next
145                        // packed input value
146                        let remaining_bits = ((row + 1) * $W) % T;
147                        let current_bits = $W - remaining_bits;
148                        tmp = (src >> shift) & mask(current_bits);
149
                           // T rows of W bits fill exactly W words of T bits per lane, so word
                           // indices run 0..W; skip the load once the last word has been consumed.
150                        if next_word < $W {
151                            // Load the next packed value
152                            src = $packed[<$T>::LANES * next_word + $lane];
153                            // Consume the remaining bits from the next input value.
154                            tmp |= (src & mask(remaining_bits)) << current_bits;
155                        }
156                    } else {
157                        // Otherwise, just grab W bits from the src value
158                        tmp = (src >> shift) & mask($W);
159                    }
160
161                    // Write out the unpacked value
162                    let idx = index(row, $lane);
163                    __kernel__!(idx, tmp);
164                }));
165            }
166        }
167    };
168}
169
170// Macro for repeating a code block bit_size_of::<T> times.
// Usage: `seq_t!(row in u16 { ... })`. Each arm matches one literal type name
// (`u8`, `u16`, `u32`, `u64`) and forwards to `seq_macro::seq!` with the range
// `0..<bit width of that type>`; any other type has no matching arm.
// NOTE(review): `pack!`/`unpack!` invoke this through `paste!(...)`, presumably so the
// captured `$T` reaches these arms as a plain token that the literal patterns can
// match - confirm before removing that wrapper.
171macro_rules! seq_t {
172    ($ident:ident in u8 $body:tt) => {seq_macro::seq!($ident in 0..8 $body)};
173    ($ident:ident in u16 $body:tt) => {seq_macro::seq!($ident in 0..16 $body)};
174    ($ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)};
175    ($ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)};
176}
177
178/// Bit-packing of 1024-element blocks to and from a bit width chosen at run time.
///
/// In the method docs, `T` is the bit width of `Self` (see [`FastLanes::T`]) and `W` is
/// the `width` argument. The implementations in this file accept `0 <= W <= T`: packing
/// with `W == 0` writes nothing and unpacking with `W == 0` fills the output with zeros.
179pub trait BitPacking: FastLanes {
180    /// Packs 1024 elements into `W` bits each, where `W` is runtime-known instead of
181    /// compile-time known.
182    ///
183    /// # Safety
184    /// The input slice must be of exactly length 1024. The output slice must be of length
185    /// `1024 * W / T`, where `T` is the bit-width of Self and `W` is the packed width.
186    /// These lengths are checked only with `debug_assert` (i.e., not checked on release builds).
    /// `W` must not be greater than `T`; that bound is likewise only a `debug_assert`.
187    unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]);
188
189    /// Unpacks 1024 elements from `W` bits each, where `W` is runtime-known instead of
190    /// compile-time known.
191    ///
192    /// # Safety
193    /// The input slice must be of length `1024 * W / T`, where `T` is the bit-width of Self and `W`
194    /// is the packed width. The output slice must be of exactly length 1024.
195    /// These lengths are checked only with `debug_assert` (i.e., not checked on release builds).
    /// `W` must not be greater than `T`; that bound is likewise only a `debug_assert`.
196    unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]);
197}
198
199impl BitPacking for u8 {
200    unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]) {
201        let packed_len = 128 * width / size_of::<Self>();
202        debug_assert_eq!(
203            output.len(),
204            packed_len,
205            "Output buffer must be of size 1024 * W / T"
206        );
207        debug_assert_eq!(input.len(), 1024, "Input buffer must be of size 1024");
208        debug_assert!(
209            width <= Self::T,
210            "Width must be less than or equal to {}",
211            Self::T
212        );
213
214        match width {
215            0 => {
216                // Nothing to write when width is zero.
217            }
218            1 => pack_8_1(
219                array_ref![input, 0, 1024],
220                array_mut_ref![output, 0, 1024 / 8],
221            ),
222            2 => pack_8_2(
223                array_ref![input, 0, 1024],
224                array_mut_ref![output, 0, 1024 * 2 / 8],
225            ),
226            3 => pack_8_3(
227                array_ref![input, 0, 1024],
228                array_mut_ref![output, 0, 1024 * 3 / 8],
229            ),
230            4 => pack_8_4(
231                array_ref![input, 0, 1024],
232                array_mut_ref![output, 0, 1024 * 4 / 8],
233            ),
234            5 => pack_8_5(
235                array_ref![input, 0, 1024],
236                array_mut_ref![output, 0, 1024 * 5 / 8],
237            ),
238            6 => pack_8_6(
239                array_ref![input, 0, 1024],
240                array_mut_ref![output, 0, 1024 * 6 / 8],
241            ),
242            7 => pack_8_7(
243                array_ref![input, 0, 1024],
244                array_mut_ref![output, 0, 1024 * 7 / 8],
245            ),
246            8 => pack_8_8(
247                array_ref![input, 0, 1024],
248                array_mut_ref![output, 0, 1024 * 8 / 8],
249            ),
250
251            _ => unreachable!("Unsupported width: {}", width),
252        }
253    }
254
255    unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]) {
256        let packed_len = 128 * width / size_of::<Self>();
257        debug_assert_eq!(
258            input.len(),
259            packed_len,
260            "Input buffer must be of size 1024 * W / T"
261        );
262        debug_assert_eq!(output.len(), 1024, "Output buffer must be of size 1024");
263        debug_assert!(
264            width <= Self::T,
265            "Width must be less than or equal to {}",
266            Self::T
267        );
268
269        match width {
270            0 => {
271                // A zero-width packed chunk implies all zeros.
272                output.fill(0);
273            }
274            1 => unpack_8_1(
275                array_ref![input, 0, 1024 / 8],
276                array_mut_ref![output, 0, 1024],
277            ),
278            2 => unpack_8_2(
279                array_ref![input, 0, 1024 * 2 / 8],
280                array_mut_ref![output, 0, 1024],
281            ),
282            3 => unpack_8_3(
283                array_ref![input, 0, 1024 * 3 / 8],
284                array_mut_ref![output, 0, 1024],
285            ),
286            4 => unpack_8_4(
287                array_ref![input, 0, 1024 * 4 / 8],
288                array_mut_ref![output, 0, 1024],
289            ),
290            5 => unpack_8_5(
291                array_ref![input, 0, 1024 * 5 / 8],
292                array_mut_ref![output, 0, 1024],
293            ),
294            6 => unpack_8_6(
295                array_ref![input, 0, 1024 * 6 / 8],
296                array_mut_ref![output, 0, 1024],
297            ),
298            7 => unpack_8_7(
299                array_ref![input, 0, 1024 * 7 / 8],
300                array_mut_ref![output, 0, 1024],
301            ),
302            8 => unpack_8_8(
303                array_ref![input, 0, 1024 * 8 / 8],
304                array_mut_ref![output, 0, 1024],
305            ),
306
307            _ => unreachable!("Unsupported width: {}", width),
308        }
309    }
310}
311
312impl BitPacking for u16 {
313    unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]) {
314        let packed_len = 128 * width / size_of::<Self>();
315        debug_assert_eq!(
316            output.len(),
317            packed_len,
318            "Output buffer must be of size 1024 * W / T"
319        );
320        debug_assert_eq!(input.len(), 1024, "Input buffer must be of size 1024");
321        debug_assert!(
322            width <= Self::T,
323            "Width must be less than or equal to {}",
324            Self::T
325        );
326
327        match width {
328            0 => {
329                // Nothing to write when width is zero.
330            }
331            1 => pack_16_1(
332                array_ref![input, 0, 1024],
333                array_mut_ref![output, 0, 1024 / 16],
334            ),
335            2 => pack_16_2(
336                array_ref![input, 0, 1024],
337                array_mut_ref![output, 0, 1024 * 2 / 16],
338            ),
339            3 => pack_16_3(
340                array_ref![input, 0, 1024],
341                array_mut_ref![output, 0, 1024 * 3 / 16],
342            ),
343            4 => pack_16_4(
344                array_ref![input, 0, 1024],
345                array_mut_ref![output, 0, 1024 * 4 / 16],
346            ),
347            5 => pack_16_5(
348                array_ref![input, 0, 1024],
349                array_mut_ref![output, 0, 1024 * 5 / 16],
350            ),
351            6 => pack_16_6(
352                array_ref![input, 0, 1024],
353                array_mut_ref![output, 0, 1024 * 6 / 16],
354            ),
355            7 => pack_16_7(
356                array_ref![input, 0, 1024],
357                array_mut_ref![output, 0, 1024 * 7 / 16],
358            ),
359            8 => pack_16_8(
360                array_ref![input, 0, 1024],
361                array_mut_ref![output, 0, 1024 * 8 / 16],
362            ),
363            9 => pack_16_9(
364                array_ref![input, 0, 1024],
365                array_mut_ref![output, 0, 1024 * 9 / 16],
366            ),
367
368            10 => pack_16_10(
369                array_ref![input, 0, 1024],
370                array_mut_ref![output, 0, 1024 * 10 / 16],
371            ),
372            11 => pack_16_11(
373                array_ref![input, 0, 1024],
374                array_mut_ref![output, 0, 1024 * 11 / 16],
375            ),
376            12 => pack_16_12(
377                array_ref![input, 0, 1024],
378                array_mut_ref![output, 0, 1024 * 12 / 16],
379            ),
380            13 => pack_16_13(
381                array_ref![input, 0, 1024],
382                array_mut_ref![output, 0, 1024 * 13 / 16],
383            ),
384            14 => pack_16_14(
385                array_ref![input, 0, 1024],
386                array_mut_ref![output, 0, 1024 * 14 / 16],
387            ),
388            15 => pack_16_15(
389                array_ref![input, 0, 1024],
390                array_mut_ref![output, 0, 1024 * 15 / 16],
391            ),
392            16 => pack_16_16(
393                array_ref![input, 0, 1024],
394                array_mut_ref![output, 0, 1024 * 16 / 16],
395            ),
396
397            _ => unreachable!("Unsupported width: {}", width),
398        }
399    }
400
401    unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]) {
402        let packed_len = 128 * width / size_of::<Self>();
403        debug_assert_eq!(
404            input.len(),
405            packed_len,
406            "Input buffer must be of size 1024 * W / T"
407        );
408        debug_assert_eq!(output.len(), 1024, "Output buffer must be of size 1024");
409        debug_assert!(
410            width <= Self::T,
411            "Width must be less than or equal to {}",
412            Self::T
413        );
414
415        match width {
416            0 => {
417                output.fill(0);
418            }
419            1 => unpack_16_1(
420                array_ref![input, 0, 1024 / 16],
421                array_mut_ref![output, 0, 1024],
422            ),
423            2 => unpack_16_2(
424                array_ref![input, 0, 1024 * 2 / 16],
425                array_mut_ref![output, 0, 1024],
426            ),
427            3 => unpack_16_3(
428                array_ref![input, 0, 1024 * 3 / 16],
429                array_mut_ref![output, 0, 1024],
430            ),
431            4 => unpack_16_4(
432                array_ref![input, 0, 1024 * 4 / 16],
433                array_mut_ref![output, 0, 1024],
434            ),
435            5 => unpack_16_5(
436                array_ref![input, 0, 1024 * 5 / 16],
437                array_mut_ref![output, 0, 1024],
438            ),
439            6 => unpack_16_6(
440                array_ref![input, 0, 1024 * 6 / 16],
441                array_mut_ref![output, 0, 1024],
442            ),
443            7 => unpack_16_7(
444                array_ref![input, 0, 1024 * 7 / 16],
445                array_mut_ref![output, 0, 1024],
446            ),
447            8 => unpack_16_8(
448                array_ref![input, 0, 1024 * 8 / 16],
449                array_mut_ref![output, 0, 1024],
450            ),
451            9 => unpack_16_9(
452                array_ref![input, 0, 1024 * 9 / 16],
453                array_mut_ref![output, 0, 1024],
454            ),
455
456            10 => unpack_16_10(
457                array_ref![input, 0, 1024 * 10 / 16],
458                array_mut_ref![output, 0, 1024],
459            ),
460            11 => unpack_16_11(
461                array_ref![input, 0, 1024 * 11 / 16],
462                array_mut_ref![output, 0, 1024],
463            ),
464            12 => unpack_16_12(
465                array_ref![input, 0, 1024 * 12 / 16],
466                array_mut_ref![output, 0, 1024],
467            ),
468            13 => unpack_16_13(
469                array_ref![input, 0, 1024 * 13 / 16],
470                array_mut_ref![output, 0, 1024],
471            ),
472            14 => unpack_16_14(
473                array_ref![input, 0, 1024 * 14 / 16],
474                array_mut_ref![output, 0, 1024],
475            ),
476            15 => unpack_16_15(
477                array_ref![input, 0, 1024 * 15 / 16],
478                array_mut_ref![output, 0, 1024],
479            ),
480            16 => unpack_16_16(
481                array_ref![input, 0, 1024 * 16 / 16],
482                array_mut_ref![output, 0, 1024],
483            ),
484
485            _ => unreachable!("Unsupported width: {}", width),
486        }
487    }
488}
489
490impl BitPacking for u32 {
491    unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]) {
492        let packed_len = 128 * width / size_of::<Self>();
493        debug_assert_eq!(
494            output.len(),
495            packed_len,
496            "Output buffer must be of size 1024 * W / T"
497        );
498        debug_assert_eq!(input.len(), 1024, "Input buffer must be of size 1024");
499        debug_assert!(
500            width <= Self::T,
501            "Width must be less than or equal to {}",
502            Self::T
503        );
504
505        match width {
506            0 => {
507                // Nothing to write when width is zero.
508            }
509            1 => pack_32_1(
510                array_ref![input, 0, 1024],
511                array_mut_ref![output, 0, 1024 / 32],
512            ),
513            2 => pack_32_2(
514                array_ref![input, 0, 1024],
515                array_mut_ref![output, 0, 1024 * 2 / 32],
516            ),
517            3 => pack_32_3(
518                array_ref![input, 0, 1024],
519                array_mut_ref![output, 0, 1024 * 3 / 32],
520            ),
521            4 => pack_32_4(
522                array_ref![input, 0, 1024],
523                array_mut_ref![output, 0, 1024 * 4 / 32],
524            ),
525            5 => pack_32_5(
526                array_ref![input, 0, 1024],
527                array_mut_ref![output, 0, 1024 * 5 / 32],
528            ),
529            6 => pack_32_6(
530                array_ref![input, 0, 1024],
531                array_mut_ref![output, 0, 1024 * 6 / 32],
532            ),
533            7 => pack_32_7(
534                array_ref![input, 0, 1024],
535                array_mut_ref![output, 0, 1024 * 7 / 32],
536            ),
537            8 => pack_32_8(
538                array_ref![input, 0, 1024],
539                array_mut_ref![output, 0, 1024 * 8 / 32],
540            ),
541            9 => pack_32_9(
542                array_ref![input, 0, 1024],
543                array_mut_ref![output, 0, 1024 * 9 / 32],
544            ),
545
546            10 => pack_32_10(
547                array_ref![input, 0, 1024],
548                array_mut_ref![output, 0, 1024 * 10 / 32],
549            ),
550            11 => pack_32_11(
551                array_ref![input, 0, 1024],
552                array_mut_ref![output, 0, 1024 * 11 / 32],
553            ),
554            12 => pack_32_12(
555                array_ref![input, 0, 1024],
556                array_mut_ref![output, 0, 1024 * 12 / 32],
557            ),
558            13 => pack_32_13(
559                array_ref![input, 0, 1024],
560                array_mut_ref![output, 0, 1024 * 13 / 32],
561            ),
562            14 => pack_32_14(
563                array_ref![input, 0, 1024],
564                array_mut_ref![output, 0, 1024 * 14 / 32],
565            ),
566            15 => pack_32_15(
567                array_ref![input, 0, 1024],
568                array_mut_ref![output, 0, 1024 * 15 / 32],
569            ),
570            16 => pack_32_16(
571                array_ref![input, 0, 1024],
572                array_mut_ref![output, 0, 1024 * 16 / 32],
573            ),
574            17 => pack_32_17(
575                array_ref![input, 0, 1024],
576                array_mut_ref![output, 0, 1024 * 17 / 32],
577            ),
578            18 => pack_32_18(
579                array_ref![input, 0, 1024],
580                array_mut_ref![output, 0, 1024 * 18 / 32],
581            ),
582            19 => pack_32_19(
583                array_ref![input, 0, 1024],
584                array_mut_ref![output, 0, 1024 * 19 / 32],
585            ),
586
587            20 => pack_32_20(
588                array_ref![input, 0, 1024],
589                array_mut_ref![output, 0, 1024 * 20 / 32],
590            ),
591            21 => pack_32_21(
592                array_ref![input, 0, 1024],
593                array_mut_ref![output, 0, 1024 * 21 / 32],
594            ),
595            22 => pack_32_22(
596                array_ref![input, 0, 1024],
597                array_mut_ref![output, 0, 1024 * 22 / 32],
598            ),
599            23 => pack_32_23(
600                array_ref![input, 0, 1024],
601                array_mut_ref![output, 0, 1024 * 23 / 32],
602            ),
603            24 => pack_32_24(
604                array_ref![input, 0, 1024],
605                array_mut_ref![output, 0, 1024 * 24 / 32],
606            ),
607            25 => pack_32_25(
608                array_ref![input, 0, 1024],
609                array_mut_ref![output, 0, 1024 * 25 / 32],
610            ),
611            26 => pack_32_26(
612                array_ref![input, 0, 1024],
613                array_mut_ref![output, 0, 1024 * 26 / 32],
614            ),
615            27 => pack_32_27(
616                array_ref![input, 0, 1024],
617                array_mut_ref![output, 0, 1024 * 27 / 32],
618            ),
619            28 => pack_32_28(
620                array_ref![input, 0, 1024],
621                array_mut_ref![output, 0, 1024 * 28 / 32],
622            ),
623            29 => pack_32_29(
624                array_ref![input, 0, 1024],
625                array_mut_ref![output, 0, 1024 * 29 / 32],
626            ),
627
628            30 => pack_32_30(
629                array_ref![input, 0, 1024],
630                array_mut_ref![output, 0, 1024 * 30 / 32],
631            ),
632            31 => pack_32_31(
633                array_ref![input, 0, 1024],
634                array_mut_ref![output, 0, 1024 * 31 / 32],
635            ),
636            32 => pack_32_32(
637                array_ref![input, 0, 1024],
638                array_mut_ref![output, 0, 1024 * 32 / 32],
639            ),
640
641            _ => unreachable!("Unsupported width: {}", width),
642        }
643    }
644
645    unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]) {
646        let packed_len = 128 * width / size_of::<Self>();
647        debug_assert_eq!(
648            input.len(),
649            packed_len,
650            "Input buffer must be of size 1024 * W / T"
651        );
652        debug_assert_eq!(output.len(), 1024, "Output buffer must be of size 1024");
653        debug_assert!(
654            width <= Self::T,
655            "Width must be less than or equal to {}",
656            Self::T
657        );
658
659        match width {
660            0 => {
661                output.fill(0);
662            }
663            1 => unpack_32_1(
664                array_ref![input, 0, 1024 / 32],
665                array_mut_ref![output, 0, 1024],
666            ),
667            2 => unpack_32_2(
668                array_ref![input, 0, 1024 * 2 / 32],
669                array_mut_ref![output, 0, 1024],
670            ),
671            3 => unpack_32_3(
672                array_ref![input, 0, 1024 * 3 / 32],
673                array_mut_ref![output, 0, 1024],
674            ),
675            4 => unpack_32_4(
676                array_ref![input, 0, 1024 * 4 / 32],
677                array_mut_ref![output, 0, 1024],
678            ),
679            5 => unpack_32_5(
680                array_ref![input, 0, 1024 * 5 / 32],
681                array_mut_ref![output, 0, 1024],
682            ),
683            6 => unpack_32_6(
684                array_ref![input, 0, 1024 * 6 / 32],
685                array_mut_ref![output, 0, 1024],
686            ),
687            7 => unpack_32_7(
688                array_ref![input, 0, 1024 * 7 / 32],
689                array_mut_ref![output, 0, 1024],
690            ),
691            8 => unpack_32_8(
692                array_ref![input, 0, 1024 * 8 / 32],
693                array_mut_ref![output, 0, 1024],
694            ),
695            9 => unpack_32_9(
696                array_ref![input, 0, 1024 * 9 / 32],
697                array_mut_ref![output, 0, 1024],
698            ),
699
700            10 => unpack_32_10(
701                array_ref![input, 0, 1024 * 10 / 32],
702                array_mut_ref![output, 0, 1024],
703            ),
704            11 => unpack_32_11(
705                array_ref![input, 0, 1024 * 11 / 32],
706                array_mut_ref![output, 0, 1024],
707            ),
708            12 => unpack_32_12(
709                array_ref![input, 0, 1024 * 12 / 32],
710                array_mut_ref![output, 0, 1024],
711            ),
712            13 => unpack_32_13(
713                array_ref![input, 0, 1024 * 13 / 32],
714                array_mut_ref![output, 0, 1024],
715            ),
716            14 => unpack_32_14(
717                array_ref![input, 0, 1024 * 14 / 32],
718                array_mut_ref![output, 0, 1024],
719            ),
720            15 => unpack_32_15(
721                array_ref![input, 0, 1024 * 15 / 32],
722                array_mut_ref![output, 0, 1024],
723            ),
724            16 => unpack_32_16(
725                array_ref![input, 0, 1024 * 16 / 32],
726                array_mut_ref![output, 0, 1024],
727            ),
728            17 => unpack_32_17(
729                array_ref![input, 0, 1024 * 17 / 32],
730                array_mut_ref![output, 0, 1024],
731            ),
732            18 => unpack_32_18(
733                array_ref![input, 0, 1024 * 18 / 32],
734                array_mut_ref![output, 0, 1024],
735            ),
736            19 => unpack_32_19(
737                array_ref![input, 0, 1024 * 19 / 32],
738                array_mut_ref![output, 0, 1024],
739            ),
740
741            20 => unpack_32_20(
742                array_ref![input, 0, 1024 * 20 / 32],
743                array_mut_ref![output, 0, 1024],
744            ),
745            21 => unpack_32_21(
746                array_ref![input, 0, 1024 * 21 / 32],
747                array_mut_ref![output, 0, 1024],
748            ),
749            22 => unpack_32_22(
750                array_ref![input, 0, 1024 * 22 / 32],
751                array_mut_ref![output, 0, 1024],
752            ),
753            23 => unpack_32_23(
754                array_ref![input, 0, 1024 * 23 / 32],
755                array_mut_ref![output, 0, 1024],
756            ),
757            24 => unpack_32_24(
758                array_ref![input, 0, 1024 * 24 / 32],
759                array_mut_ref![output, 0, 1024],
760            ),
761            25 => unpack_32_25(
762                array_ref![input, 0, 1024 * 25 / 32],
763                array_mut_ref![output, 0, 1024],
764            ),
765            26 => unpack_32_26(
766                array_ref![input, 0, 1024 * 26 / 32],
767                array_mut_ref![output, 0, 1024],
768            ),
769            27 => unpack_32_27(
770                array_ref![input, 0, 1024 * 27 / 32],
771                array_mut_ref![output, 0, 1024],
772            ),
773            28 => unpack_32_28(
774                array_ref![input, 0, 1024 * 28 / 32],
775                array_mut_ref![output, 0, 1024],
776            ),
777            29 => unpack_32_29(
778                array_ref![input, 0, 1024 * 29 / 32],
779                array_mut_ref![output, 0, 1024],
780            ),
781
782            30 => unpack_32_30(
783                array_ref![input, 0, 1024 * 30 / 32],
784                array_mut_ref![output, 0, 1024],
785            ),
786            31 => unpack_32_31(
787                array_ref![input, 0, 1024 * 31 / 32],
788                array_mut_ref![output, 0, 1024],
789            ),
790            32 => unpack_32_32(
791                array_ref![input, 0, 1024 * 32 / 32],
792                array_mut_ref![output, 0, 1024],
793            ),
794
795            _ => unreachable!("Unsupported width: {}", width),
796        }
797    }
798}
799
800impl BitPacking for u64 {
801    unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]) {
802        let packed_len = 128 * width / size_of::<Self>();
803        debug_assert_eq!(
804            output.len(),
805            packed_len,
806            "Output buffer must be of size 1024 * W / T"
807        );
808        debug_assert_eq!(input.len(), 1024, "Input buffer must be of size 1024");
809        debug_assert!(
810            width <= Self::T,
811            "Width must be less than or equal to {}",
812            Self::T
813        );
814
815        match width {
816            0 => {
817                // Nothing to write when width is zero.
818            }
819            1 => pack_64_1(
820                array_ref![input, 0, 1024],
821                array_mut_ref![output, 0, 1024 / 64],
822            ),
823            2 => pack_64_2(
824                array_ref![input, 0, 1024],
825                array_mut_ref![output, 0, 1024 * 2 / 64],
826            ),
827            3 => pack_64_3(
828                array_ref![input, 0, 1024],
829                array_mut_ref![output, 0, 1024 * 3 / 64],
830            ),
831            4 => pack_64_4(
832                array_ref![input, 0, 1024],
833                array_mut_ref![output, 0, 1024 * 4 / 64],
834            ),
835            5 => pack_64_5(
836                array_ref![input, 0, 1024],
837                array_mut_ref![output, 0, 1024 * 5 / 64],
838            ),
839            6 => pack_64_6(
840                array_ref![input, 0, 1024],
841                array_mut_ref![output, 0, 1024 * 6 / 64],
842            ),
843            7 => pack_64_7(
844                array_ref![input, 0, 1024],
845                array_mut_ref![output, 0, 1024 * 7 / 64],
846            ),
847            8 => pack_64_8(
848                array_ref![input, 0, 1024],
849                array_mut_ref![output, 0, 1024 * 8 / 64],
850            ),
851            9 => pack_64_9(
852                array_ref![input, 0, 1024],
853                array_mut_ref![output, 0, 1024 * 9 / 64],
854            ),
855
856            10 => pack_64_10(
857                array_ref![input, 0, 1024],
858                array_mut_ref![output, 0, 1024 * 10 / 64],
859            ),
860            11 => pack_64_11(
861                array_ref![input, 0, 1024],
862                array_mut_ref![output, 0, 1024 * 11 / 64],
863            ),
864            12 => pack_64_12(
865                array_ref![input, 0, 1024],
866                array_mut_ref![output, 0, 1024 * 12 / 64],
867            ),
868            13 => pack_64_13(
869                array_ref![input, 0, 1024],
870                array_mut_ref![output, 0, 1024 * 13 / 64],
871            ),
872            14 => pack_64_14(
873                array_ref![input, 0, 1024],
874                array_mut_ref![output, 0, 1024 * 14 / 64],
875            ),
876            15 => pack_64_15(
877                array_ref![input, 0, 1024],
878                array_mut_ref![output, 0, 1024 * 15 / 64],
879            ),
880            16 => pack_64_16(
881                array_ref![input, 0, 1024],
882                array_mut_ref![output, 0, 1024 * 16 / 64],
883            ),
884            17 => pack_64_17(
885                array_ref![input, 0, 1024],
886                array_mut_ref![output, 0, 1024 * 17 / 64],
887            ),
888            18 => pack_64_18(
889                array_ref![input, 0, 1024],
890                array_mut_ref![output, 0, 1024 * 18 / 64],
891            ),
892            19 => pack_64_19(
893                array_ref![input, 0, 1024],
894                array_mut_ref![output, 0, 1024 * 19 / 64],
895            ),
896
897            20 => pack_64_20(
898                array_ref![input, 0, 1024],
899                array_mut_ref![output, 0, 1024 * 20 / 64],
900            ),
901            21 => pack_64_21(
902                array_ref![input, 0, 1024],
903                array_mut_ref![output, 0, 1024 * 21 / 64],
904            ),
905            22 => pack_64_22(
906                array_ref![input, 0, 1024],
907                array_mut_ref![output, 0, 1024 * 22 / 64],
908            ),
909            23 => pack_64_23(
910                array_ref![input, 0, 1024],
911                array_mut_ref![output, 0, 1024 * 23 / 64],
912            ),
913            24 => pack_64_24(
914                array_ref![input, 0, 1024],
915                array_mut_ref![output, 0, 1024 * 24 / 64],
916            ),
917            25 => pack_64_25(
918                array_ref![input, 0, 1024],
919                array_mut_ref![output, 0, 1024 * 25 / 64],
920            ),
921            26 => pack_64_26(
922                array_ref![input, 0, 1024],
923                array_mut_ref![output, 0, 1024 * 26 / 64],
924            ),
925            27 => pack_64_27(
926                array_ref![input, 0, 1024],
927                array_mut_ref![output, 0, 1024 * 27 / 64],
928            ),
929            28 => pack_64_28(
930                array_ref![input, 0, 1024],
931                array_mut_ref![output, 0, 1024 * 28 / 64],
932            ),
933            29 => pack_64_29(
934                array_ref![input, 0, 1024],
935                array_mut_ref![output, 0, 1024 * 29 / 64],
936            ),
937
938            30 => pack_64_30(
939                array_ref![input, 0, 1024],
940                array_mut_ref![output, 0, 1024 * 30 / 64],
941            ),
942            31 => pack_64_31(
943                array_ref![input, 0, 1024],
944                array_mut_ref![output, 0, 1024 * 31 / 64],
945            ),
946            32 => pack_64_32(
947                array_ref![input, 0, 1024],
948                array_mut_ref![output, 0, 1024 * 32 / 64],
949            ),
950            33 => pack_64_33(
951                array_ref![input, 0, 1024],
952                array_mut_ref![output, 0, 1024 * 33 / 64],
953            ),
954            34 => pack_64_34(
955                array_ref![input, 0, 1024],
956                array_mut_ref![output, 0, 1024 * 34 / 64],
957            ),
958            35 => pack_64_35(
959                array_ref![input, 0, 1024],
960                array_mut_ref![output, 0, 1024 * 35 / 64],
961            ),
962            36 => pack_64_36(
963                array_ref![input, 0, 1024],
964                array_mut_ref![output, 0, 1024 * 36 / 64],
965            ),
966            37 => pack_64_37(
967                array_ref![input, 0, 1024],
968                array_mut_ref![output, 0, 1024 * 37 / 64],
969            ),
970            38 => pack_64_38(
971                array_ref![input, 0, 1024],
972                array_mut_ref![output, 0, 1024 * 38 / 64],
973            ),
974            39 => pack_64_39(
975                array_ref![input, 0, 1024],
976                array_mut_ref![output, 0, 1024 * 39 / 64],
977            ),
978
979            40 => pack_64_40(
980                array_ref![input, 0, 1024],
981                array_mut_ref![output, 0, 1024 * 40 / 64],
982            ),
983            41 => pack_64_41(
984                array_ref![input, 0, 1024],
985                array_mut_ref![output, 0, 1024 * 41 / 64],
986            ),
987            42 => pack_64_42(
988                array_ref![input, 0, 1024],
989                array_mut_ref![output, 0, 1024 * 42 / 64],
990            ),
991            43 => pack_64_43(
992                array_ref![input, 0, 1024],
993                array_mut_ref![output, 0, 1024 * 43 / 64],
994            ),
995            44 => pack_64_44(
996                array_ref![input, 0, 1024],
997                array_mut_ref![output, 0, 1024 * 44 / 64],
998            ),
999            45 => pack_64_45(
1000                array_ref![input, 0, 1024],
1001                array_mut_ref![output, 0, 1024 * 45 / 64],
1002            ),
1003            46 => pack_64_46(
1004                array_ref![input, 0, 1024],
1005                array_mut_ref![output, 0, 1024 * 46 / 64],
1006            ),
1007            47 => pack_64_47(
1008                array_ref![input, 0, 1024],
1009                array_mut_ref![output, 0, 1024 * 47 / 64],
1010            ),
1011            48 => pack_64_48(
1012                array_ref![input, 0, 1024],
1013                array_mut_ref![output, 0, 1024 * 48 / 64],
1014            ),
1015            49 => pack_64_49(
1016                array_ref![input, 0, 1024],
1017                array_mut_ref![output, 0, 1024 * 49 / 64],
1018            ),
1019
1020            50 => pack_64_50(
1021                array_ref![input, 0, 1024],
1022                array_mut_ref![output, 0, 1024 * 50 / 64],
1023            ),
1024            51 => pack_64_51(
1025                array_ref![input, 0, 1024],
1026                array_mut_ref![output, 0, 1024 * 51 / 64],
1027            ),
1028            52 => pack_64_52(
1029                array_ref![input, 0, 1024],
1030                array_mut_ref![output, 0, 1024 * 52 / 64],
1031            ),
1032            53 => pack_64_53(
1033                array_ref![input, 0, 1024],
1034                array_mut_ref![output, 0, 1024 * 53 / 64],
1035            ),
1036            54 => pack_64_54(
1037                array_ref![input, 0, 1024],
1038                array_mut_ref![output, 0, 1024 * 54 / 64],
1039            ),
1040            55 => pack_64_55(
1041                array_ref![input, 0, 1024],
1042                array_mut_ref![output, 0, 1024 * 55 / 64],
1043            ),
1044            56 => pack_64_56(
1045                array_ref![input, 0, 1024],
1046                array_mut_ref![output, 0, 1024 * 56 / 64],
1047            ),
1048            57 => pack_64_57(
1049                array_ref![input, 0, 1024],
1050                array_mut_ref![output, 0, 1024 * 57 / 64],
1051            ),
1052            58 => pack_64_58(
1053                array_ref![input, 0, 1024],
1054                array_mut_ref![output, 0, 1024 * 58 / 64],
1055            ),
1056            59 => pack_64_59(
1057                array_ref![input, 0, 1024],
1058                array_mut_ref![output, 0, 1024 * 59 / 64],
1059            ),
1060
1061            60 => pack_64_60(
1062                array_ref![input, 0, 1024],
1063                array_mut_ref![output, 0, 1024 * 60 / 64],
1064            ),
1065            61 => pack_64_61(
1066                array_ref![input, 0, 1024],
1067                array_mut_ref![output, 0, 1024 * 61 / 64],
1068            ),
1069            62 => pack_64_62(
1070                array_ref![input, 0, 1024],
1071                array_mut_ref![output, 0, 1024 * 62 / 64],
1072            ),
1073            63 => pack_64_63(
1074                array_ref![input, 0, 1024],
1075                array_mut_ref![output, 0, 1024 * 63 / 64],
1076            ),
1077            64 => pack_64_64(
1078                array_ref![input, 0, 1024],
1079                array_mut_ref![output, 0, 1024 * 64 / 64],
1080            ),
1081
1082            _ => unreachable!("Unsupported width: {}", width),
1083        }
1084    }
1085
1086    unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]) {
1087        let packed_len = 128 * width / size_of::<Self>();
1088        debug_assert_eq!(
1089            input.len(),
1090            packed_len,
1091            "Input buffer must be of size 1024 * W / T"
1092        );
1093        debug_assert_eq!(output.len(), 1024, "Output buffer must be of size 1024");
1094        debug_assert!(
1095            width <= Self::T,
1096            "Width must be less than or equal to {}",
1097            Self::T
1098        );
1099
1100        match width {
1101            0 => {
1102                output.fill(0);
1103            }
1104            1 => unpack_64_1(
1105                array_ref![input, 0, 1024 / 64],
1106                array_mut_ref![output, 0, 1024],
1107            ),
1108            2 => unpack_64_2(
1109                array_ref![input, 0, 1024 * 2 / 64],
1110                array_mut_ref![output, 0, 1024],
1111            ),
1112            3 => unpack_64_3(
1113                array_ref![input, 0, 1024 * 3 / 64],
1114                array_mut_ref![output, 0, 1024],
1115            ),
1116            4 => unpack_64_4(
1117                array_ref![input, 0, 1024 * 4 / 64],
1118                array_mut_ref![output, 0, 1024],
1119            ),
1120            5 => unpack_64_5(
1121                array_ref![input, 0, 1024 * 5 / 64],
1122                array_mut_ref![output, 0, 1024],
1123            ),
1124            6 => unpack_64_6(
1125                array_ref![input, 0, 1024 * 6 / 64],
1126                array_mut_ref![output, 0, 1024],
1127            ),
1128            7 => unpack_64_7(
1129                array_ref![input, 0, 1024 * 7 / 64],
1130                array_mut_ref![output, 0, 1024],
1131            ),
1132            8 => unpack_64_8(
1133                array_ref![input, 0, 1024 * 8 / 64],
1134                array_mut_ref![output, 0, 1024],
1135            ),
1136            9 => unpack_64_9(
1137                array_ref![input, 0, 1024 * 9 / 64],
1138                array_mut_ref![output, 0, 1024],
1139            ),
1140
1141            10 => unpack_64_10(
1142                array_ref![input, 0, 1024 * 10 / 64],
1143                array_mut_ref![output, 0, 1024],
1144            ),
1145            11 => unpack_64_11(
1146                array_ref![input, 0, 1024 * 11 / 64],
1147                array_mut_ref![output, 0, 1024],
1148            ),
1149            12 => unpack_64_12(
1150                array_ref![input, 0, 1024 * 12 / 64],
1151                array_mut_ref![output, 0, 1024],
1152            ),
1153            13 => unpack_64_13(
1154                array_ref![input, 0, 1024 * 13 / 64],
1155                array_mut_ref![output, 0, 1024],
1156            ),
1157            14 => unpack_64_14(
1158                array_ref![input, 0, 1024 * 14 / 64],
1159                array_mut_ref![output, 0, 1024],
1160            ),
1161            15 => unpack_64_15(
1162                array_ref![input, 0, 1024 * 15 / 64],
1163                array_mut_ref![output, 0, 1024],
1164            ),
1165            16 => unpack_64_16(
1166                array_ref![input, 0, 1024 * 16 / 64],
1167                array_mut_ref![output, 0, 1024],
1168            ),
1169            17 => unpack_64_17(
1170                array_ref![input, 0, 1024 * 17 / 64],
1171                array_mut_ref![output, 0, 1024],
1172            ),
1173            18 => unpack_64_18(
1174                array_ref![input, 0, 1024 * 18 / 64],
1175                array_mut_ref![output, 0, 1024],
1176            ),
1177            19 => unpack_64_19(
1178                array_ref![input, 0, 1024 * 19 / 64],
1179                array_mut_ref![output, 0, 1024],
1180            ),
1181
1182            20 => unpack_64_20(
1183                array_ref![input, 0, 1024 * 20 / 64],
1184                array_mut_ref![output, 0, 1024],
1185            ),
1186            21 => unpack_64_21(
1187                array_ref![input, 0, 1024 * 21 / 64],
1188                array_mut_ref![output, 0, 1024],
1189            ),
1190            22 => unpack_64_22(
1191                array_ref![input, 0, 1024 * 22 / 64],
1192                array_mut_ref![output, 0, 1024],
1193            ),
1194            23 => unpack_64_23(
1195                array_ref![input, 0, 1024 * 23 / 64],
1196                array_mut_ref![output, 0, 1024],
1197            ),
1198            24 => unpack_64_24(
1199                array_ref![input, 0, 1024 * 24 / 64],
1200                array_mut_ref![output, 0, 1024],
1201            ),
1202            25 => unpack_64_25(
1203                array_ref![input, 0, 1024 * 25 / 64],
1204                array_mut_ref![output, 0, 1024],
1205            ),
1206            26 => unpack_64_26(
1207                array_ref![input, 0, 1024 * 26 / 64],
1208                array_mut_ref![output, 0, 1024],
1209            ),
1210            27 => unpack_64_27(
1211                array_ref![input, 0, 1024 * 27 / 64],
1212                array_mut_ref![output, 0, 1024],
1213            ),
1214            28 => unpack_64_28(
1215                array_ref![input, 0, 1024 * 28 / 64],
1216                array_mut_ref![output, 0, 1024],
1217            ),
1218            29 => unpack_64_29(
1219                array_ref![input, 0, 1024 * 29 / 64],
1220                array_mut_ref![output, 0, 1024],
1221            ),
1222
1223            30 => unpack_64_30(
1224                array_ref![input, 0, 1024 * 30 / 64],
1225                array_mut_ref![output, 0, 1024],
1226            ),
1227            31 => unpack_64_31(
1228                array_ref![input, 0, 1024 * 31 / 64],
1229                array_mut_ref![output, 0, 1024],
1230            ),
1231            32 => unpack_64_32(
1232                array_ref![input, 0, 1024 * 32 / 64],
1233                array_mut_ref![output, 0, 1024],
1234            ),
1235            33 => unpack_64_33(
1236                array_ref![input, 0, 1024 * 33 / 64],
1237                array_mut_ref![output, 0, 1024],
1238            ),
1239            34 => unpack_64_34(
1240                array_ref![input, 0, 1024 * 34 / 64],
1241                array_mut_ref![output, 0, 1024],
1242            ),
1243            35 => unpack_64_35(
1244                array_ref![input, 0, 1024 * 35 / 64],
1245                array_mut_ref![output, 0, 1024],
1246            ),
1247            36 => unpack_64_36(
1248                array_ref![input, 0, 1024 * 36 / 64],
1249                array_mut_ref![output, 0, 1024],
1250            ),
1251            37 => unpack_64_37(
1252                array_ref![input, 0, 1024 * 37 / 64],
1253                array_mut_ref![output, 0, 1024],
1254            ),
1255            38 => unpack_64_38(
1256                array_ref![input, 0, 1024 * 38 / 64],
1257                array_mut_ref![output, 0, 1024],
1258            ),
1259            39 => unpack_64_39(
1260                array_ref![input, 0, 1024 * 39 / 64],
1261                array_mut_ref![output, 0, 1024],
1262            ),
1263
1264            40 => unpack_64_40(
1265                array_ref![input, 0, 1024 * 40 / 64],
1266                array_mut_ref![output, 0, 1024],
1267            ),
1268            41 => unpack_64_41(
1269                array_ref![input, 0, 1024 * 41 / 64],
1270                array_mut_ref![output, 0, 1024],
1271            ),
1272            42 => unpack_64_42(
1273                array_ref![input, 0, 1024 * 42 / 64],
1274                array_mut_ref![output, 0, 1024],
1275            ),
1276            43 => unpack_64_43(
1277                array_ref![input, 0, 1024 * 43 / 64],
1278                array_mut_ref![output, 0, 1024],
1279            ),
1280            44 => unpack_64_44(
1281                array_ref![input, 0, 1024 * 44 / 64],
1282                array_mut_ref![output, 0, 1024],
1283            ),
1284            45 => unpack_64_45(
1285                array_ref![input, 0, 1024 * 45 / 64],
1286                array_mut_ref![output, 0, 1024],
1287            ),
1288            46 => unpack_64_46(
1289                array_ref![input, 0, 1024 * 46 / 64],
1290                array_mut_ref![output, 0, 1024],
1291            ),
1292            47 => unpack_64_47(
1293                array_ref![input, 0, 1024 * 47 / 64],
1294                array_mut_ref![output, 0, 1024],
1295            ),
1296            48 => unpack_64_48(
1297                array_ref![input, 0, 1024 * 48 / 64],
1298                array_mut_ref![output, 0, 1024],
1299            ),
1300            49 => unpack_64_49(
1301                array_ref![input, 0, 1024 * 49 / 64],
1302                array_mut_ref![output, 0, 1024],
1303            ),
1304
1305            50 => unpack_64_50(
1306                array_ref![input, 0, 1024 * 50 / 64],
1307                array_mut_ref![output, 0, 1024],
1308            ),
1309            51 => unpack_64_51(
1310                array_ref![input, 0, 1024 * 51 / 64],
1311                array_mut_ref![output, 0, 1024],
1312            ),
1313            52 => unpack_64_52(
1314                array_ref![input, 0, 1024 * 52 / 64],
1315                array_mut_ref![output, 0, 1024],
1316            ),
1317            53 => unpack_64_53(
1318                array_ref![input, 0, 1024 * 53 / 64],
1319                array_mut_ref![output, 0, 1024],
1320            ),
1321            54 => unpack_64_54(
1322                array_ref![input, 0, 1024 * 54 / 64],
1323                array_mut_ref![output, 0, 1024],
1324            ),
1325            55 => unpack_64_55(
1326                array_ref![input, 0, 1024 * 55 / 64],
1327                array_mut_ref![output, 0, 1024],
1328            ),
1329            56 => unpack_64_56(
1330                array_ref![input, 0, 1024 * 56 / 64],
1331                array_mut_ref![output, 0, 1024],
1332            ),
1333            57 => unpack_64_57(
1334                array_ref![input, 0, 1024 * 57 / 64],
1335                array_mut_ref![output, 0, 1024],
1336            ),
1337            58 => unpack_64_58(
1338                array_ref![input, 0, 1024 * 58 / 64],
1339                array_mut_ref![output, 0, 1024],
1340            ),
1341            59 => unpack_64_59(
1342                array_ref![input, 0, 1024 * 59 / 64],
1343                array_mut_ref![output, 0, 1024],
1344            ),
1345
1346            60 => unpack_64_60(
1347                array_ref![input, 0, 1024 * 60 / 64],
1348                array_mut_ref![output, 0, 1024],
1349            ),
1350            61 => unpack_64_61(
1351                array_ref![input, 0, 1024 * 61 / 64],
1352                array_mut_ref![output, 0, 1024],
1353            ),
1354            62 => unpack_64_62(
1355                array_ref![input, 0, 1024 * 62 / 64],
1356                array_mut_ref![output, 0, 1024],
1357            ),
1358            63 => unpack_64_63(
1359                array_ref![input, 0, 1024 * 63 / 64],
1360                array_mut_ref![output, 0, 1024],
1361            ),
1362            64 => unpack_64_64(
1363                array_ref![input, 0, 1024 * 64 / 64],
1364                array_mut_ref![output, 0, 1024],
1365            ),
1366
1367            _ => unreachable!("Unsupported width: {}", width),
1368        }
1369    }
1370}
1371
1372macro_rules! unpack_8 {
1373    ($name:ident, $bits:expr) => {
1374        fn $name(input: &[u8; 1024 * $bits / u8::T], output: &mut [u8; 1024]) {
1375            for lane in 0..u8::LANES {
1376                unpack!(u8, $bits, input, lane, |$idx, $elem| {
1377                    output[$idx] = $elem;
1378                });
1379            }
1380        }
1381    };
1382}
1383
1384unpack_8!(unpack_8_1, 1);
1385unpack_8!(unpack_8_2, 2);
1386unpack_8!(unpack_8_3, 3);
1387unpack_8!(unpack_8_4, 4);
1388unpack_8!(unpack_8_5, 5);
1389unpack_8!(unpack_8_6, 6);
1390unpack_8!(unpack_8_7, 7);
1391unpack_8!(unpack_8_8, 8);
1392
1393macro_rules! pack_8 {
1394    ($name:ident, $bits:expr) => {
1395        fn $name(input: &[u8; 1024], output: &mut [u8; 1024 * $bits / u8::T]) {
1396            for lane in 0..u8::LANES {
1397                pack!(u8, $bits, output, lane, |$idx| { input[$idx] });
1398            }
1399        }
1400    };
1401}
1402pack_8!(pack_8_1, 1);
1403pack_8!(pack_8_2, 2);
1404pack_8!(pack_8_3, 3);
1405pack_8!(pack_8_4, 4);
1406pack_8!(pack_8_5, 5);
1407pack_8!(pack_8_6, 6);
1408pack_8!(pack_8_7, 7);
1409pack_8!(pack_8_8, 8);
1410
1411macro_rules! unpack_16 {
1412    ($name:ident, $bits:expr) => {
1413        fn $name(input: &[u16; 1024 * $bits / u16::T], output: &mut [u16; 1024]) {
1414            for lane in 0..u16::LANES {
1415                unpack!(u16, $bits, input, lane, |$idx, $elem| {
1416                    output[$idx] = $elem;
1417                });
1418            }
1419        }
1420    };
1421}
1422
1423unpack_16!(unpack_16_1, 1);
1424unpack_16!(unpack_16_2, 2);
1425unpack_16!(unpack_16_3, 3);
1426unpack_16!(unpack_16_4, 4);
1427unpack_16!(unpack_16_5, 5);
1428unpack_16!(unpack_16_6, 6);
1429unpack_16!(unpack_16_7, 7);
1430unpack_16!(unpack_16_8, 8);
1431unpack_16!(unpack_16_9, 9);
1432unpack_16!(unpack_16_10, 10);
1433unpack_16!(unpack_16_11, 11);
1434unpack_16!(unpack_16_12, 12);
1435unpack_16!(unpack_16_13, 13);
1436unpack_16!(unpack_16_14, 14);
1437unpack_16!(unpack_16_15, 15);
1438unpack_16!(unpack_16_16, 16);
1439
1440macro_rules! pack_16 {
1441    ($name:ident, $bits:expr) => {
1442        fn $name(input: &[u16; 1024], output: &mut [u16; 1024 * $bits / u16::T]) {
1443            for lane in 0..u16::LANES {
1444                pack!(u16, $bits, output, lane, |$idx| { input[$idx] });
1445            }
1446        }
1447    };
1448}
1449
1450pack_16!(pack_16_1, 1);
1451pack_16!(pack_16_2, 2);
1452pack_16!(pack_16_3, 3);
1453pack_16!(pack_16_4, 4);
1454pack_16!(pack_16_5, 5);
1455pack_16!(pack_16_6, 6);
1456pack_16!(pack_16_7, 7);
1457pack_16!(pack_16_8, 8);
1458pack_16!(pack_16_9, 9);
1459pack_16!(pack_16_10, 10);
1460pack_16!(pack_16_11, 11);
1461pack_16!(pack_16_12, 12);
1462pack_16!(pack_16_13, 13);
1463pack_16!(pack_16_14, 14);
1464pack_16!(pack_16_15, 15);
1465pack_16!(pack_16_16, 16);
1466
1467macro_rules! unpack_32 {
1468    ($name:ident, $bit_width:expr) => {
1469        fn $name(input: &[u32; 1024 * $bit_width / u32::T], output: &mut [u32; 1024]) {
1470            for lane in 0..u32::LANES {
1471                unpack!(u32, $bit_width, input, lane, |$idx, $elem| {
1472                    output[$idx] = $elem
1473                });
1474            }
1475        }
1476    };
1477}
1478
1479unpack_32!(unpack_32_1, 1);
1480unpack_32!(unpack_32_2, 2);
1481unpack_32!(unpack_32_3, 3);
1482unpack_32!(unpack_32_4, 4);
1483unpack_32!(unpack_32_5, 5);
1484unpack_32!(unpack_32_6, 6);
1485unpack_32!(unpack_32_7, 7);
1486unpack_32!(unpack_32_8, 8);
1487unpack_32!(unpack_32_9, 9);
1488unpack_32!(unpack_32_10, 10);
1489unpack_32!(unpack_32_11, 11);
1490unpack_32!(unpack_32_12, 12);
1491unpack_32!(unpack_32_13, 13);
1492unpack_32!(unpack_32_14, 14);
1493unpack_32!(unpack_32_15, 15);
1494unpack_32!(unpack_32_16, 16);
1495unpack_32!(unpack_32_17, 17);
1496unpack_32!(unpack_32_18, 18);
1497unpack_32!(unpack_32_19, 19);
1498unpack_32!(unpack_32_20, 20);
1499unpack_32!(unpack_32_21, 21);
1500unpack_32!(unpack_32_22, 22);
1501unpack_32!(unpack_32_23, 23);
1502unpack_32!(unpack_32_24, 24);
1503unpack_32!(unpack_32_25, 25);
1504unpack_32!(unpack_32_26, 26);
1505unpack_32!(unpack_32_27, 27);
1506unpack_32!(unpack_32_28, 28);
1507unpack_32!(unpack_32_29, 29);
1508unpack_32!(unpack_32_30, 30);
1509unpack_32!(unpack_32_31, 31);
1510unpack_32!(unpack_32_32, 32);
1511
1512macro_rules! pack_32 {
1513    ($name:ident, $bits:expr) => {
1514        fn $name(input: &[u32; 1024], output: &mut [u32; 1024 * $bits / u32::BITS as usize]) {
1515            for lane in 0..u32::LANES {
1516                pack!(u32, $bits, output, lane, |$idx| { input[$idx] });
1517            }
1518        }
1519    };
1520}
1521
1522pack_32!(pack_32_1, 1);
1523pack_32!(pack_32_2, 2);
1524pack_32!(pack_32_3, 3);
1525pack_32!(pack_32_4, 4);
1526pack_32!(pack_32_5, 5);
1527pack_32!(pack_32_6, 6);
1528pack_32!(pack_32_7, 7);
1529pack_32!(pack_32_8, 8);
1530pack_32!(pack_32_9, 9);
1531pack_32!(pack_32_10, 10);
1532pack_32!(pack_32_11, 11);
1533pack_32!(pack_32_12, 12);
1534pack_32!(pack_32_13, 13);
1535pack_32!(pack_32_14, 14);
1536pack_32!(pack_32_15, 15);
1537pack_32!(pack_32_16, 16);
1538pack_32!(pack_32_17, 17);
1539pack_32!(pack_32_18, 18);
1540pack_32!(pack_32_19, 19);
1541pack_32!(pack_32_20, 20);
1542pack_32!(pack_32_21, 21);
1543pack_32!(pack_32_22, 22);
1544pack_32!(pack_32_23, 23);
1545pack_32!(pack_32_24, 24);
1546pack_32!(pack_32_25, 25);
1547pack_32!(pack_32_26, 26);
1548pack_32!(pack_32_27, 27);
1549pack_32!(pack_32_28, 28);
1550pack_32!(pack_32_29, 29);
1551pack_32!(pack_32_30, 30);
1552pack_32!(pack_32_31, 31);
1553pack_32!(pack_32_32, 32);
1554
1555macro_rules! unpack_64 {
1556    ($name:ident, $bit_width:expr) => {
1557        fn $name(input: &[u64; 1024 * $bit_width / u64::T], output: &mut [u64; 1024]) {
1558            for lane in 0..u64::LANES {
1559                unpack!(u64, $bit_width, input, lane, |$idx, $elem| {
1560                    output[$idx] = $elem
1561                });
1562            }
1563        }
1564    };
1565}
1566
1567unpack_64!(unpack_64_1, 1);
1568unpack_64!(unpack_64_2, 2);
1569unpack_64!(unpack_64_3, 3);
1570unpack_64!(unpack_64_4, 4);
1571unpack_64!(unpack_64_5, 5);
1572unpack_64!(unpack_64_6, 6);
1573unpack_64!(unpack_64_7, 7);
1574unpack_64!(unpack_64_8, 8);
1575unpack_64!(unpack_64_9, 9);
1576unpack_64!(unpack_64_10, 10);
1577unpack_64!(unpack_64_11, 11);
1578unpack_64!(unpack_64_12, 12);
1579unpack_64!(unpack_64_13, 13);
1580unpack_64!(unpack_64_14, 14);
1581unpack_64!(unpack_64_15, 15);
1582unpack_64!(unpack_64_16, 16);
1583unpack_64!(unpack_64_17, 17);
1584unpack_64!(unpack_64_18, 18);
1585unpack_64!(unpack_64_19, 19);
1586unpack_64!(unpack_64_20, 20);
1587unpack_64!(unpack_64_21, 21);
1588unpack_64!(unpack_64_22, 22);
1589unpack_64!(unpack_64_23, 23);
1590unpack_64!(unpack_64_24, 24);
1591unpack_64!(unpack_64_25, 25);
1592unpack_64!(unpack_64_26, 26);
1593unpack_64!(unpack_64_27, 27);
1594unpack_64!(unpack_64_28, 28);
1595unpack_64!(unpack_64_29, 29);
1596unpack_64!(unpack_64_30, 30);
1597unpack_64!(unpack_64_31, 31);
1598unpack_64!(unpack_64_32, 32);
1599
1600unpack_64!(unpack_64_33, 33);
1601unpack_64!(unpack_64_34, 34);
1602unpack_64!(unpack_64_35, 35);
1603unpack_64!(unpack_64_36, 36);
1604unpack_64!(unpack_64_37, 37);
1605unpack_64!(unpack_64_38, 38);
1606unpack_64!(unpack_64_39, 39);
1607unpack_64!(unpack_64_40, 40);
1608unpack_64!(unpack_64_41, 41);
1609unpack_64!(unpack_64_42, 42);
1610unpack_64!(unpack_64_43, 43);
1611unpack_64!(unpack_64_44, 44);
1612unpack_64!(unpack_64_45, 45);
1613unpack_64!(unpack_64_46, 46);
1614unpack_64!(unpack_64_47, 47);
1615unpack_64!(unpack_64_48, 48);
1616unpack_64!(unpack_64_49, 49);
1617unpack_64!(unpack_64_50, 50);
1618unpack_64!(unpack_64_51, 51);
1619unpack_64!(unpack_64_52, 52);
1620unpack_64!(unpack_64_53, 53);
1621unpack_64!(unpack_64_54, 54);
1622unpack_64!(unpack_64_55, 55);
1623unpack_64!(unpack_64_56, 56);
1624unpack_64!(unpack_64_57, 57);
1625unpack_64!(unpack_64_58, 58);
1626unpack_64!(unpack_64_59, 59);
1627unpack_64!(unpack_64_60, 60);
1628unpack_64!(unpack_64_61, 61);
1629unpack_64!(unpack_64_62, 62);
1630unpack_64!(unpack_64_63, 63);
1631unpack_64!(unpack_64_64, 64);
1632
1633macro_rules! pack_64 {
1634    ($name:ident, $bits:expr) => {
1635        fn $name(input: &[u64; 1024], output: &mut [u64; 1024 * $bits / u64::BITS as usize]) {
1636            for lane in 0..u64::LANES {
1637                pack!(u64, $bits, output, lane, |$idx| { input[$idx] });
1638            }
1639        }
1640    };
1641}
1642
1643pack_64!(pack_64_1, 1);
1644pack_64!(pack_64_2, 2);
1645pack_64!(pack_64_3, 3);
1646pack_64!(pack_64_4, 4);
1647pack_64!(pack_64_5, 5);
1648pack_64!(pack_64_6, 6);
1649pack_64!(pack_64_7, 7);
1650pack_64!(pack_64_8, 8);
1651pack_64!(pack_64_9, 9);
1652pack_64!(pack_64_10, 10);
1653pack_64!(pack_64_11, 11);
1654pack_64!(pack_64_12, 12);
1655pack_64!(pack_64_13, 13);
1656pack_64!(pack_64_14, 14);
1657pack_64!(pack_64_15, 15);
1658pack_64!(pack_64_16, 16);
1659pack_64!(pack_64_17, 17);
1660pack_64!(pack_64_18, 18);
1661pack_64!(pack_64_19, 19);
1662pack_64!(pack_64_20, 20);
1663pack_64!(pack_64_21, 21);
1664pack_64!(pack_64_22, 22);
1665pack_64!(pack_64_23, 23);
1666pack_64!(pack_64_24, 24);
1667pack_64!(pack_64_25, 25);
1668pack_64!(pack_64_26, 26);
1669pack_64!(pack_64_27, 27);
1670pack_64!(pack_64_28, 28);
1671pack_64!(pack_64_29, 29);
1672pack_64!(pack_64_30, 30);
1673pack_64!(pack_64_31, 31);
1674pack_64!(pack_64_32, 32);
1675
1676pack_64!(pack_64_33, 33);
1677pack_64!(pack_64_34, 34);
1678pack_64!(pack_64_35, 35);
1679pack_64!(pack_64_36, 36);
1680pack_64!(pack_64_37, 37);
1681pack_64!(pack_64_38, 38);
1682pack_64!(pack_64_39, 39);
1683pack_64!(pack_64_40, 40);
1684pack_64!(pack_64_41, 41);
1685pack_64!(pack_64_42, 42);
1686pack_64!(pack_64_43, 43);
1687pack_64!(pack_64_44, 44);
1688pack_64!(pack_64_45, 45);
1689pack_64!(pack_64_46, 46);
1690pack_64!(pack_64_47, 47);
1691pack_64!(pack_64_48, 48);
1692pack_64!(pack_64_49, 49);
1693pack_64!(pack_64_50, 50);
1694pack_64!(pack_64_51, 51);
1695pack_64!(pack_64_52, 52);
1696pack_64!(pack_64_53, 53);
1697pack_64!(pack_64_54, 54);
1698pack_64!(pack_64_55, 55);
1699pack_64!(pack_64_56, 56);
1700pack_64!(pack_64_57, 57);
1701pack_64!(pack_64_58, 58);
1702pack_64!(pack_64_59, 59);
1703pack_64!(pack_64_60, 60);
1704pack_64!(pack_64_61, 61);
1705pack_64!(pack_64_62, 62);
1706pack_64!(pack_64_63, 63);
1707pack_64!(pack_64_64, 64);
1708
1709#[cfg(test)]
1710mod test {
1711    use super::*;
1712    use core::array;
1713    // a fast random number generator
1714    pub struct XorShift {
1715        state: u64,
1716    }
1717
1718    impl XorShift {
1719        pub fn new(seed: u64) -> Self {
1720            Self { state: seed }
1721        }
1722
1723        pub fn next(&mut self) -> u64 {
1724            let mut x = self.state;
1725            x ^= x << 13;
1726            x ^= x >> 7;
1727            x ^= x << 17;
1728            self.state = x;
1729            x
1730        }
1731    }
1732
1733    // a macro version of this function generalize u8, u16, u32, u64 takes very long time for a test build, so I
1734    // write it for each type separately
1735    fn pack_unpack_u8(bit_width: usize) {
1736        let mut values: [u8; 1024] = [0; 1024];
1737        let mut rng = XorShift::new(123456789);
1738        for value in &mut values {
1739            *value = (rng.next() % (1 << bit_width)) as u8;
1740        }
1741
1742        let mut packed = vec![0; 1024 * bit_width / 8];
1743        for lane in 0..u8::LANES {
1744            // Always loop over lanes first. This is what the compiler vectorizes.
1745            pack!(u8, bit_width, packed, lane, |$pos| {
1746                values[$pos]
1747            });
1748        }
1749
1750        let mut unpacked: [u8; 1024] = [0; 1024];
1751        for lane in 0..u8::LANES {
1752            // Always loop over lanes first. This is what the compiler vectorizes.
1753            unpack!(u8, bit_width, packed, lane, |$idx, $elem| {
1754                unpacked[$idx] = $elem;
1755            });
1756        }
1757
1758        assert_eq!(values, unpacked);
1759    }
1760
1761    fn pack_unpack_u16(bit_width: usize) {
1762        let mut values: [u16; 1024] = [0; 1024];
1763        let mut rng = XorShift::new(123456789);
1764        for value in &mut values {
1765            *value = (rng.next() % (1 << bit_width)) as u16;
1766        }
1767
1768        let mut packed = vec![0; 1024 * bit_width / 16];
1769        for lane in 0..u16::LANES {
1770            // Always loop over lanes first. This is what the compiler vectorizes.
1771            pack!(u16, bit_width, packed, lane, |$pos| {
1772                values[$pos]
1773            });
1774        }
1775
1776        let mut unpacked: [u16; 1024] = [0; 1024];
1777        for lane in 0..u16::LANES {
1778            // Always loop over lanes first. This is what the compiler vectorizes.
1779            unpack!(u16, bit_width, packed, lane, |$idx, $elem| {
1780                unpacked[$idx] = $elem;
1781            });
1782        }
1783
1784        assert_eq!(values, unpacked);
1785    }
1786
1787    fn pack_unpack_u32(bit_width: usize) {
1788        let mut values: [u32; 1024] = [0; 1024];
1789        let mut rng = XorShift::new(123456789);
1790        for value in &mut values {
1791            *value = (rng.next() % (1 << bit_width)) as u32;
1792        }
1793
1794        let mut packed = vec![0; 1024 * bit_width / 32];
1795        for lane in 0..u32::LANES {
1796            // Always loop over lanes first. This is what the compiler vectorizes.
1797            pack!(u32, bit_width, packed, lane, |$pos| {
1798                values[$pos]
1799            });
1800        }
1801
1802        let mut unpacked: [u32; 1024] = [0; 1024];
1803        for lane in 0..u32::LANES {
1804            // Always loop over lanes first. This is what the compiler vectorizes.
1805            unpack!(u32, bit_width, packed, lane, |$idx, $elem| {
1806                unpacked[$idx] = $elem;
1807            });
1808        }
1809
1810        assert_eq!(values, unpacked);
1811    }
1812
1813    fn pack_unpack_u64(bit_width: usize) {
1814        let mut values: [u64; 1024] = [0; 1024];
1815        let mut rng = XorShift::new(123456789);
1816        if bit_width == 64 {
1817            for value in &mut values {
1818                *value = rng.next();
1819            }
1820        } else {
1821            for value in &mut values {
1822                *value = rng.next() % (1 << bit_width);
1823            }
1824        }
1825
1826        let mut packed = vec![0; 1024 * bit_width / 64];
1827        for lane in 0..u64::LANES {
1828            // Always loop over lanes first. This is what the compiler vectorizes.
1829            pack!(u64, bit_width, packed, lane, |$pos| {
1830                values[$pos]
1831            });
1832        }
1833
1834        let mut unpacked: [u64; 1024] = [0; 1024];
1835        for lane in 0..u64::LANES {
1836            // Always loop over lanes first. This is what the compiler vectorizes.
1837            unpack!(u64, bit_width, packed, lane, |$idx, $elem| {
1838                unpacked[$idx] = $elem;
1839            });
1840        }
1841
1842        assert_eq!(values, unpacked);
1843    }
1844
1845    #[test]
1846    fn test_pack() {
1847        pack_unpack_u8(0);
1848        pack_unpack_u8(1);
1849        pack_unpack_u8(2);
1850        pack_unpack_u8(3);
1851        pack_unpack_u8(4);
1852        pack_unpack_u8(5);
1853        pack_unpack_u8(6);
1854        pack_unpack_u8(7);
1855        pack_unpack_u8(8);
1856
1857        pack_unpack_u16(0);
1858        pack_unpack_u16(1);
1859        pack_unpack_u16(2);
1860        pack_unpack_u16(3);
1861        pack_unpack_u16(4);
1862        pack_unpack_u16(5);
1863        pack_unpack_u16(6);
1864        pack_unpack_u16(7);
1865        pack_unpack_u16(8);
1866        pack_unpack_u16(9);
1867        pack_unpack_u16(10);
1868        pack_unpack_u16(11);
1869        pack_unpack_u16(12);
1870        pack_unpack_u16(13);
1871        pack_unpack_u16(14);
1872        pack_unpack_u16(15);
1873        pack_unpack_u16(16);
1874
1875        pack_unpack_u32(0);
1876        pack_unpack_u32(1);
1877        pack_unpack_u32(2);
1878        pack_unpack_u32(3);
1879        pack_unpack_u32(4);
1880        pack_unpack_u32(5);
1881        pack_unpack_u32(6);
1882        pack_unpack_u32(7);
1883        pack_unpack_u32(8);
1884        pack_unpack_u32(9);
1885        pack_unpack_u32(10);
1886        pack_unpack_u32(11);
1887        pack_unpack_u32(12);
1888        pack_unpack_u32(13);
1889        pack_unpack_u32(14);
1890        pack_unpack_u32(15);
1891        pack_unpack_u32(16);
1892        pack_unpack_u32(17);
1893        pack_unpack_u32(18);
1894        pack_unpack_u32(19);
1895        pack_unpack_u32(20);
1896        pack_unpack_u32(21);
1897        pack_unpack_u32(22);
1898        pack_unpack_u32(23);
1899        pack_unpack_u32(24);
1900        pack_unpack_u32(25);
1901        pack_unpack_u32(26);
1902        pack_unpack_u32(27);
1903        pack_unpack_u32(28);
1904        pack_unpack_u32(29);
1905        pack_unpack_u32(30);
1906        pack_unpack_u32(31);
1907        pack_unpack_u32(32);
1908
1909        pack_unpack_u64(0);
1910        pack_unpack_u64(1);
1911        pack_unpack_u64(2);
1912        pack_unpack_u64(3);
1913        pack_unpack_u64(4);
1914        pack_unpack_u64(5);
1915        pack_unpack_u64(6);
1916        pack_unpack_u64(7);
1917        pack_unpack_u64(8);
1918        pack_unpack_u64(9);
1919        pack_unpack_u64(10);
1920        pack_unpack_u64(11);
1921        pack_unpack_u64(12);
1922        pack_unpack_u64(13);
1923        pack_unpack_u64(14);
1924        pack_unpack_u64(15);
1925        pack_unpack_u64(16);
1926        pack_unpack_u64(17);
1927        pack_unpack_u64(18);
1928        pack_unpack_u64(19);
1929        pack_unpack_u64(20);
1930        pack_unpack_u64(21);
1931        pack_unpack_u64(22);
1932        pack_unpack_u64(23);
1933        pack_unpack_u64(24);
1934        pack_unpack_u64(25);
1935        pack_unpack_u64(26);
1936        pack_unpack_u64(27);
1937        pack_unpack_u64(28);
1938        pack_unpack_u64(29);
1939        pack_unpack_u64(30);
1940        pack_unpack_u64(31);
1941        pack_unpack_u64(32);
1942        pack_unpack_u64(33);
1943        pack_unpack_u64(34);
1944        pack_unpack_u64(35);
1945        pack_unpack_u64(36);
1946        pack_unpack_u64(37);
1947        pack_unpack_u64(38);
1948        pack_unpack_u64(39);
1949        pack_unpack_u64(40);
1950        pack_unpack_u64(41);
1951        pack_unpack_u64(42);
1952        pack_unpack_u64(43);
1953        pack_unpack_u64(44);
1954        pack_unpack_u64(45);
1955        pack_unpack_u64(46);
1956        pack_unpack_u64(47);
1957        pack_unpack_u64(48);
1958        pack_unpack_u64(49);
1959        pack_unpack_u64(50);
1960        pack_unpack_u64(51);
1961        pack_unpack_u64(52);
1962        pack_unpack_u64(53);
1963        pack_unpack_u64(54);
1964        pack_unpack_u64(55);
1965        pack_unpack_u64(56);
1966        pack_unpack_u64(57);
1967        pack_unpack_u64(58);
1968        pack_unpack_u64(59);
1969        pack_unpack_u64(60);
1970        pack_unpack_u64(61);
1971        pack_unpack_u64(62);
1972        pack_unpack_u64(63);
1973        pack_unpack_u64(64);
1974    }
1975
1976    fn unchecked_pack_unpack_u8(bit_width: usize) {
1977        let mut values = [0u8; 1024];
1978        let mut rng = XorShift::new(123456789);
1979        for value in &mut values {
1980            *value = (rng.next() % (1 << bit_width)) as u8;
1981        }
1982        let mut packed = vec![0; 1024 * bit_width / 8];
1983        unsafe {
1984            BitPacking::unchecked_pack(bit_width, &values, &mut packed);
1985        }
1986        let mut output = [0; 1024];
1987        unsafe { BitPacking::unchecked_unpack(bit_width, &packed, &mut output) };
1988        assert_eq!(values, output);
1989    }
1990
1991    fn unchecked_pack_unpack_u16(bit_width: usize) {
1992        let mut values = [0u16; 1024];
1993        let mut rng = XorShift::new(123456789);
1994        for value in &mut values {
1995            *value = (rng.next() % (1 << bit_width)) as u16;
1996        }
1997        let mut packed = vec![0; 1024 * bit_width / u16::T];
1998        unsafe {
1999            BitPacking::unchecked_pack(bit_width, &values, &mut packed);
2000        }
2001        let mut output = [0; 1024];
2002        unsafe { BitPacking::unchecked_unpack(bit_width, &packed, &mut output) };
2003        assert_eq!(values, output);
2004    }
2005
2006    fn unchecked_pack_unpack_u32(bit_width: usize) {
2007        let mut values = [0u32; 1024];
2008        let mut rng = XorShift::new(123456789);
2009        for value in &mut values {
2010            *value = (rng.next() % (1 << bit_width)) as u32;
2011        }
2012        let mut packed = vec![0; 1024 * bit_width / u32::T];
2013        unsafe {
2014            BitPacking::unchecked_pack(bit_width, &values, &mut packed);
2015        }
2016        let mut output = [0; 1024];
2017        unsafe { BitPacking::unchecked_unpack(bit_width, &packed, &mut output) };
2018        assert_eq!(values, output);
2019    }
2020
2021    fn unchecked_pack_unpack_u64(bit_width: usize) {
2022        let mut values = [0u64; 1024];
2023        let mut rng = XorShift::new(123456789);
2024        if bit_width == 64 {
2025            for value in &mut values {
2026                *value = rng.next();
2027            }
2028        }
2029        let mut packed = vec![0; 1024 * bit_width / u64::T];
2030        unsafe {
2031            BitPacking::unchecked_pack(bit_width, &values, &mut packed);
2032        }
2033        let mut output = [0; 1024];
2034        unsafe { BitPacking::unchecked_unpack(bit_width, &packed, &mut output) };
2035        assert_eq!(values, output);
2036    }
2037
2038    #[test]
2039    fn test_unchecked_pack() {
2040        let input = array::from_fn(|i| i as u32);
2041        let mut packed = [0; 320];
2042        unsafe { BitPacking::unchecked_pack(10, &input, &mut packed) };
2043        let mut output = [0; 1024];
2044        unsafe { BitPacking::unchecked_unpack(10, &packed, &mut output) };
2045        assert_eq!(input, output);
2046
2047        unchecked_pack_unpack_u8(1);
2048        unchecked_pack_unpack_u8(2);
2049        unchecked_pack_unpack_u8(3);
2050        unchecked_pack_unpack_u8(4);
2051        unchecked_pack_unpack_u8(5);
2052        unchecked_pack_unpack_u8(6);
2053        unchecked_pack_unpack_u8(7);
2054        unchecked_pack_unpack_u8(8);
2055
2056        unchecked_pack_unpack_u16(1);
2057        unchecked_pack_unpack_u16(2);
2058        unchecked_pack_unpack_u16(3);
2059        unchecked_pack_unpack_u16(4);
2060        unchecked_pack_unpack_u16(5);
2061        unchecked_pack_unpack_u16(6);
2062        unchecked_pack_unpack_u16(7);
2063        unchecked_pack_unpack_u16(8);
2064        unchecked_pack_unpack_u16(9);
2065        unchecked_pack_unpack_u16(10);
2066        unchecked_pack_unpack_u16(11);
2067        unchecked_pack_unpack_u16(12);
2068        unchecked_pack_unpack_u16(13);
2069        unchecked_pack_unpack_u16(14);
2070        unchecked_pack_unpack_u16(15);
2071        unchecked_pack_unpack_u16(16);
2072
2073        unchecked_pack_unpack_u32(1);
2074        unchecked_pack_unpack_u32(2);
2075        unchecked_pack_unpack_u32(3);
2076        unchecked_pack_unpack_u32(4);
2077        unchecked_pack_unpack_u32(5);
2078        unchecked_pack_unpack_u32(6);
2079        unchecked_pack_unpack_u32(7);
2080        unchecked_pack_unpack_u32(8);
2081        unchecked_pack_unpack_u32(9);
2082        unchecked_pack_unpack_u32(10);
2083        unchecked_pack_unpack_u32(11);
2084        unchecked_pack_unpack_u32(12);
2085        unchecked_pack_unpack_u32(13);
2086        unchecked_pack_unpack_u32(14);
2087        unchecked_pack_unpack_u32(15);
2088        unchecked_pack_unpack_u32(16);
2089        unchecked_pack_unpack_u32(17);
2090        unchecked_pack_unpack_u32(18);
2091        unchecked_pack_unpack_u32(19);
2092        unchecked_pack_unpack_u32(20);
2093        unchecked_pack_unpack_u32(21);
2094        unchecked_pack_unpack_u32(22);
2095        unchecked_pack_unpack_u32(23);
2096        unchecked_pack_unpack_u32(24);
2097        unchecked_pack_unpack_u32(25);
2098        unchecked_pack_unpack_u32(26);
2099        unchecked_pack_unpack_u32(27);
2100        unchecked_pack_unpack_u32(28);
2101        unchecked_pack_unpack_u32(29);
2102        unchecked_pack_unpack_u32(30);
2103        unchecked_pack_unpack_u32(31);
2104        unchecked_pack_unpack_u32(32);
2105
2106        unchecked_pack_unpack_u64(1);
2107        unchecked_pack_unpack_u64(2);
2108        unchecked_pack_unpack_u64(3);
2109        unchecked_pack_unpack_u64(4);
2110        unchecked_pack_unpack_u64(5);
2111        unchecked_pack_unpack_u64(6);
2112        unchecked_pack_unpack_u64(7);
2113        unchecked_pack_unpack_u64(8);
2114        unchecked_pack_unpack_u64(9);
2115        unchecked_pack_unpack_u64(10);
2116        unchecked_pack_unpack_u64(11);
2117        unchecked_pack_unpack_u64(12);
2118        unchecked_pack_unpack_u64(13);
2119        unchecked_pack_unpack_u64(14);
2120        unchecked_pack_unpack_u64(15);
2121        unchecked_pack_unpack_u64(16);
2122        unchecked_pack_unpack_u64(17);
2123        unchecked_pack_unpack_u64(18);
2124        unchecked_pack_unpack_u64(19);
2125        unchecked_pack_unpack_u64(20);
2126        unchecked_pack_unpack_u64(21);
2127        unchecked_pack_unpack_u64(22);
2128        unchecked_pack_unpack_u64(23);
2129        unchecked_pack_unpack_u64(24);
2130        unchecked_pack_unpack_u64(25);
2131        unchecked_pack_unpack_u64(26);
2132        unchecked_pack_unpack_u64(27);
2133        unchecked_pack_unpack_u64(28);
2134        unchecked_pack_unpack_u64(29);
2135        unchecked_pack_unpack_u64(30);
2136        unchecked_pack_unpack_u64(31);
2137        unchecked_pack_unpack_u64(32);
2138        unchecked_pack_unpack_u64(33);
2139        unchecked_pack_unpack_u64(34);
2140        unchecked_pack_unpack_u64(35);
2141        unchecked_pack_unpack_u64(36);
2142        unchecked_pack_unpack_u64(37);
2143        unchecked_pack_unpack_u64(38);
2144        unchecked_pack_unpack_u64(39);
2145        unchecked_pack_unpack_u64(40);
2146        unchecked_pack_unpack_u64(41);
2147        unchecked_pack_unpack_u64(42);
2148        unchecked_pack_unpack_u64(43);
2149        unchecked_pack_unpack_u64(44);
2150        unchecked_pack_unpack_u64(45);
2151        unchecked_pack_unpack_u64(46);
2152        unchecked_pack_unpack_u64(47);
2153        unchecked_pack_unpack_u64(48);
2154        unchecked_pack_unpack_u64(49);
2155        unchecked_pack_unpack_u64(50);
2156        unchecked_pack_unpack_u64(51);
2157        unchecked_pack_unpack_u64(52);
2158        unchecked_pack_unpack_u64(53);
2159        unchecked_pack_unpack_u64(54);
2160        unchecked_pack_unpack_u64(55);
2161        unchecked_pack_unpack_u64(56);
2162        unchecked_pack_unpack_u64(57);
2163        unchecked_pack_unpack_u64(58);
2164        unchecked_pack_unpack_u64(59);
2165        unchecked_pack_unpack_u64(60);
2166        unchecked_pack_unpack_u64(61);
2167        unchecked_pack_unpack_u64(62);
2168        unchecked_pack_unpack_u64(63);
2169        unchecked_pack_unpack_u64(64);
2170    }
2171}