p3_util/
lib.rs

//! Various simple utilities.

#![no_std]

extern crate alloc;

use alloc::slice;
use alloc::string::String;
use alloc::vec::Vec;
use core::any::type_name;
use core::hint::unreachable_unchecked;
use core::mem::{ManuallyDrop, MaybeUninit};
use core::{iter, mem};

use crate::transpose::transpose_in_place_square;

pub mod array_serialization;
pub mod linear_map;
pub mod transpose;
pub mod zip_eq;

/// Computes `ceil(log_2(n))`.
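///
/// A small usage sketch:
///
/// ```
/// assert_eq!(p3_util::log2_ceil_usize(5), 3);
/// assert_eq!(p3_util::log2_ceil_usize(8), 3);
/// ```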
#[must_use]
pub const fn log2_ceil_usize(n: usize) -> usize {
    (usize::BITS - n.saturating_sub(1).leading_zeros()) as usize
}

#[must_use]
pub fn log2_ceil_u64(n: u64) -> u64 {
    (u64::BITS - n.saturating_sub(1).leading_zeros()).into()
}

/// Computes `log_2(n)`
///
/// # Panics
/// Panics if `n` is not a power of two.
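///
/// A small usage sketch:
///
/// ```
/// assert_eq!(p3_util::log2_strict_usize(16), 4);
/// ```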
#[must_use]
#[inline]
pub fn log2_strict_usize(n: usize) -> usize {
    let res = n.trailing_zeros();
    assert_eq!(n.wrapping_shr(res), 1, "Not a power of two: {n}");
    // Tell the optimizer about the semantics of `log2_strict`, i.e. it can replace `n` with
    // `1 << res` and vice versa.
    assume(n == 1 << res);
    res as usize
}

/// Returns `[0, ..., N - 1]`.
#[must_use]
pub const fn indices_arr<const N: usize>() -> [usize; N] {
    let mut indices_arr = [0; N];
    let mut i = 0;
    while i < N {
        indices_arr[i] = i;
        i += 1;
    }
    indices_arr
}

#[inline]
pub const fn reverse_bits(x: usize, n: usize) -> usize {
    // Assert that n is a power of 2
    debug_assert!(n.is_power_of_two());
    reverse_bits_len(x, n.trailing_zeros() as usize)
}

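/// Returns the `bit_len`-bit reversal of the low `bit_len` bits of `x`.
///
/// A small usage sketch (the case is taken from the tests below):
///
/// ```
/// assert_eq!(p3_util::reverse_bits_len(0b01011, 5), 0b11010);
/// ```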
#[inline]
pub const fn reverse_bits_len(x: usize, bit_len: usize) -> usize {
    // NB: The only reason we need overflowing_shr() here as opposed
    // to plain '>>' is to accommodate the case x == bit_len == 0,
    // which would become `0 >> 64`. Rust thinks that any shift of 64
    // bits causes overflow, even when the argument is zero.
    x.reverse_bits()
        .overflowing_shr(usize::BITS - bit_len as u32)
        .0
}

// Lookup table of 6-bit reverses.
// NB: 2^6=64 bytes is a cache line. A smaller table wastes cache space.
#[cfg(not(target_arch = "aarch64"))]
#[rustfmt::skip]
const BIT_REVERSE_6BIT: &[u8] = &[
    0o00, 0o40, 0o20, 0o60, 0o10, 0o50, 0o30, 0o70,
    0o04, 0o44, 0o24, 0o64, 0o14, 0o54, 0o34, 0o74,
    0o02, 0o42, 0o22, 0o62, 0o12, 0o52, 0o32, 0o72,
    0o06, 0o46, 0o26, 0o66, 0o16, 0o56, 0o36, 0o76,
    0o01, 0o41, 0o21, 0o61, 0o11, 0o51, 0o31, 0o71,
    0o05, 0o45, 0o25, 0o65, 0o15, 0o55, 0o35, 0o75,
    0o03, 0o43, 0o23, 0o63, 0o13, 0o53, 0o33, 0o73,
    0o07, 0o47, 0o27, 0o67, 0o17, 0o57, 0o37, 0o77,
];

// Ensure that SMALL_ARR_SIZE >= 4 * BIG_T_SIZE.
const BIG_T_SIZE: usize = 1 << 14;
const SMALL_ARR_SIZE: usize = 1 << 16;

/// Permutes `vals` such that each index is mapped to its reverse in binary.
///
/// If the whole array fits in fast cache, then the trivial algorithm is cache-friendly. Also, if
/// `F` is really big, then the trivial algorithm is cache-friendly, no matter the size of the array.
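///
/// A small usage sketch (the same case appears in the tests below):
///
/// ```
/// let mut vals = vec![10, 20, 30, 40];
/// p3_util::reverse_slice_index_bits(&mut vals);
/// assert_eq!(vals, vec![10, 30, 20, 40]);
/// ```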
pub fn reverse_slice_index_bits<F>(vals: &mut [F])
where
    F: Copy + Send + Sync,
{
    let n = vals.len();
    if n == 0 {
        return;
    }
    let log_n = log2_strict_usize(n);

    // If the whole array fits in fast cache, then the trivial algorithm is cache-friendly. Also, if
    // `F` is really big, then the trivial algorithm is cache-friendly, no matter the size of the array.
    if core::mem::size_of::<F>() << log_n <= SMALL_ARR_SIZE
        || core::mem::size_of::<F>() >= BIG_T_SIZE
    {
        reverse_slice_index_bits_small(vals, log_n);
    } else {
        debug_assert!(n >= 4); // By our choice of `BIG_T_SIZE` and `SMALL_ARR_SIZE`.

        // Algorithm:
        //
        // Treat `vals` as a `sqrt(n)` by `sqrt(n)` row-major matrix. (Assume for now that `lb_n` is
        // even, i.e., `n` is a square number.) To perform bit-order reversal we:
        //  1. Bit-reverse the order of the rows. (They are contiguous in memory, so this is
        //     basically a series of large `memcpy`s.)
        //  2. Transpose the matrix.
        //  3. Bit-reverse the order of the rows.
        //
        // This is equivalent to, for every index `0 <= i < n`:
        //  1. bit-reversing `i[lb_n / 2..lb_n]`,
        //  2. swapping `i[0..lb_n / 2]` and `i[lb_n / 2..lb_n]`,
        //  3. bit-reversing `i[lb_n / 2..lb_n]`.
        //
        // If `lb_n` is odd, i.e., `n` is not a square number, then the above procedure requires
        // slight modification. At steps 1 and 3 we bit-reverse bits `ceil(lb_n / 2)..lb_n` of the
        // index (shuffling `2^floor(lb_n / 2)` chunks of length `2^ceil(lb_n / 2)`). At step 2, we
        // perform _two_ transposes. We treat `vals` as two matrices, one where the middle bit of the
        // index is `0` and another where the middle bit is `1`; we transpose each individually.

        let lb_num_chunks = log_n >> 1;
        let lb_chunk_size = log_n - lb_num_chunks;
        unsafe {
            reverse_slice_index_bits_chunks(vals, lb_num_chunks, lb_chunk_size);
            transpose_in_place_square(vals, lb_chunk_size, lb_num_chunks, 0);
            if lb_num_chunks != lb_chunk_size {
                // `vals` cannot be interpreted as a square matrix. We instead interpret it as a
                // `1 << lb_num_chunks` by `2` by `1 << lb_num_chunks` tensor, in row-major order.
                // The above transpose acted on `tensor[..., 0, ...]` (all indices with middle bit
                // `0`). We still need to transpose `tensor[..., 1, ...]`. To do so, we advance
                // `vals` by `1 << lb_num_chunks`, effectively adding that to every index.
                let vals_with_offset = &mut vals[1 << lb_num_chunks..];
                transpose_in_place_square(vals_with_offset, lb_chunk_size, lb_num_chunks, 0);
            }
            reverse_slice_index_bits_chunks(vals, lb_num_chunks, lb_chunk_size);
        }
    }
}

// Both functions below are semantically equivalent to:
//     for i in 0..n {
//         result.push(arr[reverse_bits(i, n_power)]);
//     }
// where reverse_bits(i, n_power) computes the n_power-bit reverse. The complications are there
// to guide the compiler to generate optimal assembly.

#[cfg(not(target_arch = "aarch64"))]
fn reverse_slice_index_bits_small<F>(vals: &mut [F], lb_n: usize) {
    if lb_n <= 6 {
        // BIT_REVERSE_6BIT holds 6-bit reverses. This shift makes them lb_n-bit reverses.
        let dst_shr_amt = 6 - lb_n as u32;
        #[allow(clippy::needless_range_loop)]
        for src in 0..vals.len() {
            let dst = (BIT_REVERSE_6BIT[src] as usize).wrapping_shr(dst_shr_amt);
            if src < dst {
                vals.swap(src, dst);
            }
        }
    } else {
        // LLVM does not know that it does not need to reverse src at each iteration (which is
        // expensive on x86). We take advantage of the fact that the low bits of dst change rarely
        // and the high bits of dst depend only on the low bits of src.
        let dst_lo_shr_amt = usize::BITS - (lb_n - 6) as u32;
        let dst_hi_shl_amt = lb_n - 6;
        for src_chunk in 0..(vals.len() >> 6) {
            let src_hi = src_chunk << 6;
            let dst_lo = src_chunk.reverse_bits().wrapping_shr(dst_lo_shr_amt);
            #[allow(clippy::needless_range_loop)]
            for src_lo in 0..(1 << 6) {
                let dst_hi = (BIT_REVERSE_6BIT[src_lo] as usize) << dst_hi_shl_amt;
                let src = src_hi + src_lo;
                let dst = dst_hi + dst_lo;
                if src < dst {
                    vals.swap(src, dst);
                }
            }
        }
    }
}

#[cfg(target_arch = "aarch64")]
fn reverse_slice_index_bits_small<F>(vals: &mut [F], lb_n: usize) {
    // Aarch64 can reverse bits in one instruction, so the trivial version works best.
    for src in 0..vals.len() {
        let dst = src.reverse_bits().wrapping_shr(usize::BITS - lb_n as u32);
        if src < dst {
            vals.swap(src, dst);
        }
    }
}

/// Split `vals` into chunks and bit-reverse the order of the chunks. There are `1 << lb_num_chunks`
/// chunks, each of length `1 << lb_chunk_size`.
/// SAFETY: the caller must ensure that `vals.len() == 1 << (lb_num_chunks + lb_chunk_size)`.
unsafe fn reverse_slice_index_bits_chunks<F>(
    vals: &mut [F],
    lb_num_chunks: usize,
    lb_chunk_size: usize,
) {
    for i in 0..1usize << lb_num_chunks {
        // `wrapping_shr` handles the silly case when `lb_num_chunks == 0`.
        let j = i
            .reverse_bits()
            .wrapping_shr(usize::BITS - lb_num_chunks as u32);
        if i < j {
            unsafe {
                core::ptr::swap_nonoverlapping(
                    vals.get_unchecked_mut(i << lb_chunk_size),
                    vals.get_unchecked_mut(j << lb_chunk_size),
                    1 << lb_chunk_size,
                );
            }
        }
    }
}

#[inline(always)]
pub fn assume(p: bool) {
    debug_assert!(p);
    if !p {
        unsafe {
            unreachable_unchecked();
        }
    }
}

/// Try to force Rust to emit a branch. Example:
///
/// ```no_run
/// let x = 100;
/// if x > 20 {
///     println!("x is big!");
///     p3_util::branch_hint();
/// } else {
///     println!("x is small!");
/// }
/// ```
///
/// This function has no semantics. It is a hint only.
#[inline(always)]
pub fn branch_hint() {
    // NOTE: These are the currently supported assembly architectures. See the
    // [nightly reference](https://doc.rust-lang.org/nightly/reference/inline-assembly.html) for
    // the most up-to-date list.
    #[cfg(any(
        target_arch = "aarch64",
        target_arch = "arm",
        target_arch = "riscv32",
        target_arch = "riscv64",
        target_arch = "x86",
        target_arch = "x86_64",
    ))]
    unsafe {
        core::arch::asm!("", options(nomem, nostack, preserves_flags));
    }
}

/// Return a `String` containing the name of `T` but with all the crate
/// and module prefixes removed.
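///
/// A small usage sketch (the exact output of `core::any::type_name` is not formally
/// guaranteed, so the expected string below is a typical result rather than a contract):
///
/// ```
/// assert_eq!(p3_util::pretty_name::<Vec<u8>>(), "Vec<u8>");
/// ```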
pub fn pretty_name<T>() -> String {
    let name = type_name::<T>();
    let mut result = String::new();
    for qual in name.split_inclusive(&['<', '>', ',']) {
        result.push_str(qual.split("::").last().unwrap());
    }
    result
}

/// A C-style buffered input reader, similar to
/// `core::iter::Iterator::next_chunk()` from nightly.
///
/// Returns an array of `MaybeUninit<T>` and the number of items in the
/// array which have been correctly initialized.
#[inline]
fn iter_next_chunk_erased<const BUFLEN: usize, I: Iterator>(
    iter: &mut I,
) -> ([MaybeUninit<I::Item>; BUFLEN], usize)
where
    I::Item: Copy,
{
    let mut buf = [const { MaybeUninit::<I::Item>::uninit() }; BUFLEN];
    let mut i = 0;

    while i < BUFLEN {
        if let Some(c) = iter.next() {
            // Copy the next Item into `buf`.
            unsafe {
                buf.get_unchecked_mut(i).write(c);
                i = i.unchecked_add(1);
            }
        } else {
            // No more items in the iterator.
            break;
        }
    }
    (buf, i)
}

/// Gets a shared reference to the contained value.
///
/// # Safety
///
/// Calling this when the content is not yet fully initialized causes undefined
/// behavior: it is up to the caller to guarantee that every `MaybeUninit<T>` in
/// the slice really is in an initialized state.
///
/// Copied from:
/// <https://doc.rust-lang.org/std/primitive.slice.html#method.assume_init_ref>
/// Once that is stabilized, this should be removed.
#[inline(always)]
pub const unsafe fn assume_init_ref<T>(slice: &[MaybeUninit<T>]) -> &[T] {
    // SAFETY: casting `slice` to a `*const [T]` is safe since the caller guarantees that
    // `slice` is initialized, and `MaybeUninit` is guaranteed to have the same layout as `T`.
    // The pointer obtained is valid since it refers to memory owned by `slice` which is a
    // reference and thus guaranteed to be valid for reads.
    unsafe { &*(slice as *const [MaybeUninit<T>] as *const [T]) }
}

/// Split an iterator into small arrays and apply `func` to each.
///
/// Repeatedly read `BUFLEN` elements from `input` into an array and
/// pass the array to `func` as a slice. If fewer than `BUFLEN`
/// elements remain, the smaller slice is passed to `func` (if
/// it is non-empty) and the function returns.
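///
/// A small usage sketch:
///
/// ```
/// let mut chunks = Vec::new();
/// p3_util::apply_to_chunks::<3, _, _>(1u8..=7, |chunk| chunks.push(chunk.to_vec()));
/// assert_eq!(chunks, vec![vec![1, 2, 3], vec![4, 5, 6], vec![7]]);
/// ```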
#[inline]
pub fn apply_to_chunks<const BUFLEN: usize, I, H>(input: I, mut func: H)
where
    I: IntoIterator<Item = u8>,
    H: FnMut(&[I::Item]),
{
    let mut iter = input.into_iter();
    loop {
        let (buf, n) = iter_next_chunk_erased::<BUFLEN, _>(&mut iter);
        if n == 0 {
            break;
        }
        func(unsafe { assume_init_ref(buf.get_unchecked(..n)) });
    }
}

/// Pulls `N` items from `iter` and returns them as an array. If the iterator
/// yields fewer than `N` items (but more than `0`), pads with the given default value.
///
/// Since the iterator is passed as a mutable reference and this function calls
/// `next` at most `N` times, the iterator can still be used afterwards to
/// retrieve the remaining items.
///
/// If `iter.next()` panics, all items already yielded by the iterator are
/// dropped.
#[inline]
fn iter_next_chunk_padded<T: Copy, const N: usize>(
    iter: &mut impl Iterator<Item = T>,
    default: T, // Needed due to [T; N] not always implementing Default. Can probably be dropped if const generics stabilize.
) -> Option<[T; N]> {
    let (mut arr, n) = iter_next_chunk_erased::<N, _>(iter);
    (n != 0).then(|| {
        // Fill the rest of the array with default values.
        arr[n..].fill(MaybeUninit::new(default));
        unsafe { mem::transmute_copy::<_, [T; N]>(&arr) }
    })
}

/// Returns an iterator over `N` elements of the iterator at a time.
///
/// The chunks do not overlap. If `N` does not divide the length of the
/// iterator, then the final chunk is padded to length `N` with the given default value.
///
/// This is essentially a copy-pasted version of the nightly `array_chunks` function.
/// <https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.array_chunks>
/// Once that is stabilized, this and the functions above it should be removed.
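///
/// A small usage sketch:
///
/// ```
/// let chunks: Vec<[u8; 3]> = p3_util::iter_array_chunks_padded([1u8, 2, 3, 4, 5], 0).collect();
/// assert_eq!(chunks, vec![[1, 2, 3], [4, 5, 0]]);
/// ```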
#[inline]
pub fn iter_array_chunks_padded<T: Copy, const N: usize>(
    iter: impl IntoIterator<Item = T>,
    default: T, // Needed due to [T; N] not always implementing Default. Can probably be dropped if const generics stabilize.
) -> impl Iterator<Item = [T; N]> {
    let mut iter = iter.into_iter();
    iter::from_fn(move || iter_next_chunk_padded(&mut iter, default))
}

/// Reinterpret a slice of `BaseArray` elements as a slice of `Base` elements.
///
/// This is useful to convert `&[[F; N]]` to `&[F]` or `&[A]` to `&[F]` where
/// `A` has the same size, alignment and memory layout as `[F; N]` for some `N`.
///
/// # Safety
///
/// This assumes that `BaseArray` has the same alignment and memory layout as `[Base; N]`.
/// As Rust guarantees that array elements are contiguous in memory and the alignment of
/// the array is the same as the alignment of its elements, this means that `BaseArray`
/// must have the same alignment as `Base`.
///
/// # Panics
///
/// This panics if the size of `BaseArray` is not a multiple of the size of `Base`.
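///
/// A small usage sketch, viewing `&[[u32; 4]]` as `&[u32]` (here `BaseArray = [u32; 4]`
/// and `Base = u32`, which satisfy the layout requirements above):
///
/// ```
/// let arrays: [[u32; 4]; 2] = [[1, 2, 3, 4], [5, 6, 7, 8]];
/// let flat: &[u32] = unsafe { p3_util::as_base_slice(&arrays) };
/// assert_eq!(flat, &[1, 2, 3, 4, 5, 6, 7, 8][..]);
/// ```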
#[inline]
pub unsafe fn as_base_slice<Base, BaseArray>(buf: &[BaseArray]) -> &[Base] {
    assert_eq!(align_of::<Base>(), align_of::<BaseArray>());
    // Check the documented size relationship (see "Panics" above).
    assert_eq!(size_of::<BaseArray>() % size_of::<Base>(), 0);
    let d = size_of::<BaseArray>() / size_of::<Base>();

    assert!(align_of::<BaseArray>() >= align_of::<Base>());
    let buf_ptr = buf.as_ptr().cast::<Base>();
    let n = buf.len() * d;
    unsafe { slice::from_raw_parts(buf_ptr, n) }
}

/// Reinterpret a mutable slice of `BaseArray` elements as a mutable slice of `Base` elements.
///
/// This is useful to convert `&mut [[F; N]]` to `&mut [F]` or `&mut [A]` to `&mut [F]` where
/// `A` has the same size, alignment and memory layout as `[F; N]` for some `N`.
///
/// # Safety
///
/// This assumes that `BaseArray` has the same alignment and memory layout as `[Base; N]`.
/// As Rust guarantees that array elements are contiguous in memory and the alignment of
/// the array is the same as the alignment of its elements, this means that `BaseArray`
/// must have the same alignment as `Base`.
///
/// # Panics
///
/// This panics if the size of `BaseArray` is not a multiple of the size of `Base`.
#[inline]
pub unsafe fn as_base_slice_mut<Base, BaseArray>(buf: &mut [BaseArray]) -> &mut [Base] {
    assert_eq!(align_of::<Base>(), align_of::<BaseArray>());
    // Check the documented size relationship (see "Panics" above).
    assert_eq!(size_of::<BaseArray>() % size_of::<Base>(), 0);
    let d = size_of::<BaseArray>() / size_of::<Base>();

    assert!(align_of::<BaseArray>() >= align_of::<Base>());
    let buf_ptr = buf.as_mut_ptr().cast::<Base>();
    let n = buf.len() * d;
    unsafe { slice::from_raw_parts_mut(buf_ptr, n) }
}

/// Convert a vector of `BaseArray` elements to a vector of `Base` elements without any
/// reallocations.
///
/// This is useful to convert `Vec<[F; N]>` to `Vec<F>` or `Vec<A>` to `Vec<F>` where
/// `A` has the same size, alignment and memory layout as `[F; N]` for some `N`. It can also
/// be used to safely convert `Vec<u32>` to `Vec<F>` if `F` is a 32-bit field
/// or `Vec<u64>` to `Vec<F>` if `F` is a 64-bit field.
///
/// # Safety
///
/// This assumes that `BaseArray` has the same alignment and memory layout as `[Base; N]`.
/// As Rust guarantees that array elements are contiguous in memory and the alignment of
/// the array is the same as the alignment of its elements, this means that `BaseArray`
/// must have the same alignment as `Base`.
///
/// # Panics
///
/// This panics if the size of `BaseArray` is not a multiple of the size of `Base`.
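///
/// A small usage sketch, flattening `Vec<[u32; 4]>` into `Vec<u32>` (here
/// `BaseArray = [u32; 4]` and `Base = u32`, which satisfy the layout requirements above):
///
/// ```
/// let arrays: Vec<[u32; 4]> = vec![[1, 2, 3, 4], [5, 6, 7, 8]];
/// let flat: Vec<u32> = unsafe { p3_util::flatten_to_base(arrays) };
/// assert_eq!(flat, vec![1, 2, 3, 4, 5, 6, 7, 8]);
/// ```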
#[inline]
pub unsafe fn flatten_to_base<Base, BaseArray>(vec: Vec<BaseArray>) -> Vec<Base> {
    debug_assert_eq!(align_of::<Base>(), align_of::<BaseArray>());

    assert!(
        size_of::<BaseArray>() % size_of::<Base>() == 0,
        "Size of BaseArray (got {}) must be a multiple of the size of Base ({}).",
        size_of::<BaseArray>(),
        size_of::<Base>()
    );

    let d = size_of::<BaseArray>() / size_of::<Base>();
    // Prevent running `vec`'s destructor so we are in complete control
    // of the allocation.
    let mut values = ManuallyDrop::new(vec);

    // Each `BaseArray` is an array of `d` `Base` elements, so the length and capacity of
    // the new vector will be multiplied by `d`.
    let new_len = values.len() * d;
    let new_cap = values.capacity() * d;

    // Safe as BaseArray and Base have the same alignment.
    let ptr = values.as_mut_ptr() as *mut Base;

    unsafe {
        // Safety:
        // - BaseArray and Base have the same alignment.
        // - As size_of::<BaseArray>() == size_of::<Base>() * d:
        //      -- The capacity of the new vector is equal to the capacity of the old vector.
        //      -- The first new_len elements of the new vector correspond to the first
        //         len elements of the old vector and so are properly initialized.
        Vec::from_raw_parts(ptr, new_len, new_cap)
    }
}

/// Convert a vector of `Base` elements to a vector of `BaseArray` elements, ideally without any
/// reallocations.
///
/// This is an inverse of `flatten_to_base`. Unfortunately, unlike `flatten_to_base`, it may not be
/// possible to avoid allocations. The issue is that there is no way to guarantee that the capacity
/// of the vector is a multiple of the size ratio `d = size_of::<BaseArray>() / size_of::<Base>()`.
///
/// # Safety
///
/// This assumes that `BaseArray` has the same alignment and memory layout as `[Base; N]`.
/// As Rust guarantees that array elements are contiguous in memory and the alignment of
/// the array is the same as the alignment of its elements, this means that `BaseArray`
/// must have the same alignment as `Base`.
///
/// # Panics
///
/// This panics if the size of `BaseArray` is not a multiple of the size of `Base`.
/// This panics if the length of the vector is not a multiple of the ratio of the sizes.
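///
/// A small usage sketch, rebuilding `Vec<[u32; 4]>` from `Vec<u32>` (the inverse of the
/// `flatten_to_base` example above):
///
/// ```
/// let flat: Vec<u32> = vec![1, 2, 3, 4, 5, 6, 7, 8];
/// let arrays: Vec<[u32; 4]> = unsafe { p3_util::reconstitute_from_base(flat) };
/// assert_eq!(arrays, vec![[1, 2, 3, 4], [5, 6, 7, 8]]);
/// ```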
#[inline]
pub unsafe fn reconstitute_from_base<Base, BaseArray: Clone>(mut vec: Vec<Base>) -> Vec<BaseArray> {
    assert!(
        size_of::<BaseArray>() % size_of::<Base>() == 0,
        "Size of BaseArray (got {}) must be a multiple of the size of Base ({}).",
        size_of::<BaseArray>(),
        size_of::<Base>()
    );

    let d = size_of::<BaseArray>() / size_of::<Base>();

    assert!(
        vec.len() % d == 0,
        "Vector length (got {}) must be a multiple of the extension field dimension ({}).",
        vec.len(),
        d
    );

    let new_len = vec.len() / d;

    // We could call vec.shrink_to_fit() here to try and increase the probability that
    // the capacity is a multiple of d. That might cause a reallocation though, which
    // would defeat the whole purpose.
    let cap = vec.capacity();

    // The assumption is that basically all callers of `reconstitute_from_base` will be calling it
    // with a vector constructed from `flatten_to_base` and so the capacity should be a multiple of `d`.
    // But capacities can do strange things so we need to support both possibilities.
    // Note that the `else` branch would also work if the capacity is a multiple of `d`, but it is slower.
    if cap % d == 0 {
        // Prevent running `vec`'s destructor so we are in complete control
        // of the allocation.
        let mut values = ManuallyDrop::new(vec);

        // If we are on this branch then the capacity is a multiple of `d`.
        let new_cap = cap / d;

        // Safe as BaseArray and Base have the same alignment.
        let ptr = values.as_mut_ptr() as *mut BaseArray;

        unsafe {
            // Safety:
            // - BaseArray and Base have the same alignment.
            // - As size_of::<Base>() == size_of::<BaseArray>() / d:
            //      -- If we have reached this point, the length and capacity are both divisible by `d`.
            //      -- The capacity of the new vector is equal to the capacity of the old vector.
            //      -- The first new_len elements of the new vector correspond to the first
            //         len elements of the old vector and so are properly initialized.
            Vec::from_raw_parts(ptr, new_len, new_cap)
        }
    } else {
        // If the capacity is not a multiple of `d`, we go via slices.

        let buf_ptr = vec.as_mut_ptr().cast::<BaseArray>();
        let slice = unsafe {
            // Safety:
            // - BaseArray and Base have the same alignment.
            // - As size_of::<Base>() == size_of::<BaseArray>() / d:
            //      -- If we have reached this point, the length is divisible by `d`.
            //      -- The first new_len elements of the slice correspond to the first
            //         len elements of the old slice and so are properly initialized.
            slice::from_raw_parts(buf_ptr, new_len)
        };

        // Ideally the compiler could optimize this away to avoid the copy, but it appears not to.
        slice.to_vec()
    }
}

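/// Determine whether `u` and `v` are relatively prime, i.e. whether `gcd(u, v) == 1`.
///
/// If either input is `0`, this returns `false`.
///
/// A small usage sketch (cases taken from the tests below):
///
/// ```
/// assert!(p3_util::relatively_prime_u64(17, 31));
/// assert!(!p3_util::relatively_prime_u64(12, 36));
/// ```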
#[inline(always)]
pub const fn relatively_prime_u64(mut u: u64, mut v: u64) -> bool {
    // Check that neither input is 0.
    if u == 0 || v == 0 {
        return false;
    }

    // Check divisibility by 2.
    if (u | v) & 1 == 0 {
        return false;
    }

    // Remove factors of 2 from `u` and `v`
    u >>= u.trailing_zeros();
    if u == 1 {
        return true;
    }

    while v != 0 {
        v >>= v.trailing_zeros();
        if v == 1 {
            return true;
        }

        // Ensure u <= v
        if u > v {
            core::mem::swap(&mut u, &mut v);
        }

        // This looks inefficient for v >> u but, because we remove the trailing zeros of
        // v in every iteration, it ends up much more performant than it appears at
        // first glance.
        v -= u
    }
    // If we made it through the loop, at no point is u or v equal to 1 and so the gcd
    // must be greater than 1.
    false
}

#[cfg(test)]
mod tests {
    use alloc::vec;
    use alloc::vec::Vec;

    use rand::rngs::SmallRng;
    use rand::{Rng, SeedableRng};

    use super::*;

    #[test]
    fn test_reverse_bits_len() {
        assert_eq!(reverse_bits_len(0b0000000000, 10), 0b0000000000);
        assert_eq!(reverse_bits_len(0b0000000001, 10), 0b1000000000);
        assert_eq!(reverse_bits_len(0b1000000000, 10), 0b0000000001);
        assert_eq!(reverse_bits_len(0b00000, 5), 0b00000);
        assert_eq!(reverse_bits_len(0b01011, 5), 0b11010);
    }

    #[test]
    fn test_reverse_index_bits() {
        let mut arg = vec![10, 20, 30, 40];
        reverse_slice_index_bits(&mut arg);
        assert_eq!(arg, vec![10, 30, 20, 40]);

        let mut input256: Vec<u64> = (0..256).collect();
        #[rustfmt::skip]
        let output256: Vec<u64> = vec![
            0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
            0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
            0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
            0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
            0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
            0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
            0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
            0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
            0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
            0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
            0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
            0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
            0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
            0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
            0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
            0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
        ];
        reverse_slice_index_bits(&mut input256[..]);
        assert_eq!(input256, output256);
    }

    #[test]
    fn test_apply_to_chunks_exact_fit() {
        const CHUNK_SIZE: usize = 4;
        let input: Vec<u8> = vec![1, 2, 3, 4, 5, 6, 7, 8];
        let mut results: Vec<Vec<u8>> = Vec::new();

        apply_to_chunks::<CHUNK_SIZE, _, _>(input, |chunk| {
            results.push(chunk.to_vec());
        });

        assert_eq!(results, vec![vec![1, 2, 3, 4], vec![5, 6, 7, 8]]);
    }

    #[test]
    fn test_apply_to_chunks_with_remainder() {
        const CHUNK_SIZE: usize = 3;
        let input: Vec<u8> = vec![1, 2, 3, 4, 5, 6, 7];
        let mut results: Vec<Vec<u8>> = Vec::new();

        apply_to_chunks::<CHUNK_SIZE, _, _>(input, |chunk| {
            results.push(chunk.to_vec());
        });

        assert_eq!(results, vec![vec![1, 2, 3], vec![4, 5, 6], vec![7]]);
    }

    #[test]
    fn test_apply_to_chunks_empty_input() {
        const CHUNK_SIZE: usize = 4;
        let input: Vec<u8> = vec![];
        let mut results: Vec<Vec<u8>> = Vec::new();

        apply_to_chunks::<CHUNK_SIZE, _, _>(input, |chunk| {
            results.push(chunk.to_vec());
        });

        assert!(results.is_empty());
    }

    #[test]
    fn test_apply_to_chunks_single_chunk() {
        const CHUNK_SIZE: usize = 10;
        let input: Vec<u8> = vec![1, 2, 3, 4, 5];
        let mut results: Vec<Vec<u8>> = Vec::new();

        apply_to_chunks::<CHUNK_SIZE, _, _>(input, |chunk| {
            results.push(chunk.to_vec());
        });

        assert_eq!(results, vec![vec![1, 2, 3, 4, 5]]);
    }

    #[test]
    fn test_apply_to_chunks_large_chunk_size() {
        const CHUNK_SIZE: usize = 100;
        let input: Vec<u8> = vec![1, 2, 3, 4, 5, 6, 7, 8];
        let mut results: Vec<Vec<u8>> = Vec::new();

        apply_to_chunks::<CHUNK_SIZE, _, _>(input, |chunk| {
            results.push(chunk.to_vec());
        });

        assert_eq!(results, vec![vec![1, 2, 3, 4, 5, 6, 7, 8]]);
    }

    #[test]
    fn test_apply_to_chunks_large_input() {
        const CHUNK_SIZE: usize = 5;
        let input: Vec<u8> = (1..=20).collect();
        let mut results: Vec<Vec<u8>> = Vec::new();

        apply_to_chunks::<CHUNK_SIZE, _, _>(input, |chunk| {
            results.push(chunk.to_vec());
        });

        assert_eq!(
            results,
            vec![
                vec![1, 2, 3, 4, 5],
                vec![6, 7, 8, 9, 10],
                vec![11, 12, 13, 14, 15],
                vec![16, 17, 18, 19, 20]
            ]
        );
    }

    #[test]
    fn test_reverse_slice_index_bits_random() {
        let lengths = [32, 128, 1 << 16];
        let mut rng = SmallRng::seed_from_u64(1);
        for _ in 0..32 {
            for &length in &lengths {
                let mut rand_list: Vec<u32> = Vec::with_capacity(length);
                rand_list.resize_with(length, || rng.random());
                let expect = reverse_index_bits_naive(&rand_list);

                let mut actual = rand_list.clone();
                reverse_slice_index_bits(&mut actual);

                assert_eq!(actual, expect);
            }
        }
    }

    #[test]
    fn test_log2_strict_usize_edge_cases() {
        assert_eq!(log2_strict_usize(1), 0);
        assert_eq!(log2_strict_usize(2), 1);
        assert_eq!(log2_strict_usize(1 << 18), 18);
        assert_eq!(log2_strict_usize(1 << 31), 31);
        assert_eq!(
            log2_strict_usize(1 << (usize::BITS - 1)),
            usize::BITS as usize - 1
        );
    }

    #[test]
    #[should_panic]
    fn test_log2_strict_usize_zero() {
        let _ = log2_strict_usize(0);
    }

    #[test]
    #[should_panic]
    fn test_log2_strict_usize_nonpower_2() {
        let _ = log2_strict_usize(0x78c341c65ae6d262);
    }

    #[test]
    #[should_panic]
    fn test_log2_strict_usize_max() {
        let _ = log2_strict_usize(usize::MAX);
    }

    #[test]
    fn test_log2_ceil_usize_comprehensive() {
        // Powers of 2
        assert_eq!(log2_ceil_usize(0), 0);
        assert_eq!(log2_ceil_usize(1), 0);
        assert_eq!(log2_ceil_usize(2), 1);
        assert_eq!(log2_ceil_usize(1 << 18), 18);
        assert_eq!(log2_ceil_usize(1 << 31), 31);
        assert_eq!(
            log2_ceil_usize(1 << (usize::BITS - 1)),
            usize::BITS as usize - 1
        );

        // Nonpowers; want to round up
        assert_eq!(log2_ceil_usize(3), 2);
        assert_eq!(log2_ceil_usize(0x14fe901b), 29);
        assert_eq!(
            log2_ceil_usize((1 << (usize::BITS - 1)) + 1),
            usize::BITS as usize
        );
        assert_eq!(log2_ceil_usize(usize::MAX - 1), usize::BITS as usize);
        assert_eq!(log2_ceil_usize(usize::MAX), usize::BITS as usize);
    }

    fn reverse_index_bits_naive<T: Copy>(arr: &[T]) -> Vec<T> {
        let n = arr.len();
        let n_power = log2_strict_usize(n);

        let mut out = vec![None; n];
        for (i, v) in arr.iter().enumerate() {
            let dst = i.reverse_bits() >> (usize::BITS - n_power as u32);
            out[dst] = Some(*v);
        }

        out.into_iter().map(|x| x.unwrap()).collect()
    }

    #[test]
    fn test_relatively_prime_u64() {
        // Zero cases (should always return false)
        assert!(!relatively_prime_u64(0, 0));
        assert!(!relatively_prime_u64(10, 0));
        assert!(!relatively_prime_u64(0, 10));
        assert!(!relatively_prime_u64(0, 123456789));

        // Number with itself (if greater than 1, not relatively prime)
        assert!(relatively_prime_u64(1, 1));
        assert!(!relatively_prime_u64(10, 10));
        assert!(!relatively_prime_u64(99999, 99999));

        // Powers of 2 (always false since they share factor 2)
        assert!(!relatively_prime_u64(2, 4));
        assert!(!relatively_prime_u64(16, 32));
        assert!(!relatively_prime_u64(64, 128));
        assert!(!relatively_prime_u64(1024, 4096));
        assert!(!relatively_prime_u64(u64::MAX, u64::MAX));

        // One number is a multiple of the other (always false)
        assert!(!relatively_prime_u64(5, 10));
        assert!(!relatively_prime_u64(12, 36));
        assert!(!relatively_prime_u64(15, 45));
        assert!(!relatively_prime_u64(100, 500));

        // Co-prime numbers (should be true)
        assert!(relatively_prime_u64(17, 31));
        assert!(relatively_prime_u64(97, 43));
        assert!(relatively_prime_u64(7919, 65537));
        assert!(relatively_prime_u64(15485863, 32452843));

        // Small prime numbers (should be true)
        assert!(relatively_prime_u64(13, 17));
        assert!(relatively_prime_u64(101, 103));
        assert!(relatively_prime_u64(1009, 1013));

        // Large numbers (some cases where they are relatively prime or not)
        assert!(!relatively_prime_u64(
            190266297176832000,
            10430732356495263744
        ));
        assert!(!relatively_prime_u64(
            2040134905096275968,
            5701159354248194048
        ));
        assert!(!relatively_prime_u64(
            16611311494648745984,
            7514969329383038976
        ));
        assert!(!relatively_prime_u64(
            14863931409971066880,
            7911906750992527360
        ));

        // Max values
        assert!(relatively_prime_u64(u64::MAX, 1));
        assert!(relatively_prime_u64(u64::MAX, u64::MAX - 1));
        assert!(!relatively_prime_u64(u64::MAX, u64::MAX));
    }
}