byteshuffle/
lib.rs

1//! SIMD-accelerated byte shuffle/unshuffle routines
2//!
3//! The byte-shuffle is a very efficient way to improve the compressibility of data that consists
4//! of an array of fixed-size objects.  It rearranges the array in order to group all elements'
5//! least significant bytes together, most-significant bytes together, and everything in between.
6//! Since real applications' arrays often contain consecutive elements that are closely correlated
7//! with each other, this filter frequently results in lengthy continuous runs of identical bytes.
8//! Such runs are highly compressible by general-purpose compression libraries like gzip, lz4, etc.
9//!
10//! The [blosc](https://www.blosc.org) project was the original inspiration for this library.
11//! Blosc is a C library intended primarily for HPC users, and it implements a shuffle filter,
12//! among many other things.  This crate is a clean reimplementation of Blosc's shuffle filter.
13//!
14//! # Examples
15//!
16//! Typical use: a byte array consists of an arithmetic sequence.  Shuffle it, compress it,
17//! decompress it, and then unshuffle it.
18//! ```
19//! # use byteshuffle::*;
20//! const IN: [u8; 8] = [0x00, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x04];
21//! let shuffled = shuffle(2, &IN);
22//! assert_ne!(IN, &shuffled[..]);
23//! // In normal use, you would now serialize `shuffled`.  Then compress it, and later decompress
24//! // and deserialize it.  Then unshuffle like below.
25//! let unshuffled = unshuffle(2, &shuffled);
26//! assert_eq!(IN, &unshuffled[..]);
27//! ```
28//!
29//! # Crate Features
30//!
31//! This crate has a "nightly" feature.  It enables methods that require the use of types from the
32//! standard library that aren't yet stabilized.
33#![cfg_attr(feature = "nightly", feature(core_io_borrowed_buf))]
34#![cfg_attr(docsrs, feature(doc_cfg))]
35
36#[cfg(feature = "nightly")]
37use std::io::BorrowedCursor;
38use std::{mem, str::FromStr};
39
40use cfg_if::cfg_if;
41use ctor::ctor;
42
43#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
44mod avx2;
45#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
46mod avx512f;
47mod generic;
48#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
49mod sse2;
50
51type LlFunc = unsafe fn(usize, usize, *const u8, *mut u8);
52static mut IMPL: (LlFunc, LlFunc) = (generic::shuffle, generic::unshuffle);
53
54/// Explicitly specify an instruction set to use.
55#[derive(Clone, Copy, Debug, Eq, PartialEq)]
56#[non_exhaustive]
57pub enum SimdImpl {
58    /// Automatically determine the implementation to use
59    Auto,
60    /// Don't use any SIMD accelerations
61    Generic,
62    /// Use the SSE2 instruction set
63    Sse2,
64    /// Use the AVX2 instruction set
65    Avx2,
66    /// Use the AVX512F + AVX512BW instruction sets
67    Avx512F,
68}
69
70#[derive(Clone, Copy, Debug, PartialEq, Eq)]
71pub struct ParseSimdImplErr;
72
73impl FromStr for SimdImpl {
74    type Err = ParseSimdImplErr;
75
76    fn from_str(s: &str) -> Result<Self, Self::Err> {
77        match s.to_lowercase().as_str() {
78            "auto" => Ok(SimdImpl::Auto),
79            "generic" => Ok(SimdImpl::Generic),
80            "sse2" => Ok(SimdImpl::Sse2),
81            "avx2" => Ok(SimdImpl::Avx2),
82            "avx512f" => Ok(SimdImpl::Avx512F),
83            _ => Err(ParseSimdImplErr),
84        }
85    }
86}
87
88#[ctor]
89fn select_implementation_ctor() {
90    // Safe because we're single-threaded before main
91    unsafe { select_implementation(SimdImpl::Auto) }
92}
93
94/// Force the use of a particular CPU instruction set, globally.
95///
96/// The default behavior is to automatically select, which should normally work best.  But this
97/// function may be used to force a lower instruction set for test and benchmarking purposes, or if
98/// one of the higher level optimizations does not work well.
99///
100/// # Safety
101///
102/// May not be called concurrently with any other function in this library.
103pub unsafe fn select_implementation(impl_: SimdImpl) {
104    // Safe because ctor guarantees only one writer at a time
105    cfg_if! {
106        if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
107            match impl_ {
108                SimdImpl::Auto => {
109                    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512bw"){
110                        unsafe { IMPL = (avx512f::shuffle, avx2::unshuffle); }
111                    } else if is_x86_feature_detected!("avx2") {
112                        unsafe { IMPL = (avx2::shuffle, avx2::unshuffle); }
113                    } else if is_x86_feature_detected!("sse2") {
114                        unsafe { IMPL = (sse2::shuffle, sse2::unshuffle); }
115                    } else {
116                        unsafe { IMPL = (generic::shuffle, generic::unshuffle); }
117                    }
118                },
119                SimdImpl::Generic => unsafe { IMPL = (generic::shuffle, generic::unshuffle); },
120                SimdImpl::Sse2 => unsafe { IMPL = (sse2::shuffle, sse2::unshuffle); },
121                SimdImpl::Avx2 => unsafe { IMPL = (avx2::shuffle, sse2::unshuffle); },
122                SimdImpl::Avx512F => unsafe { IMPL = (avx512f::shuffle, sse2::unshuffle); },
123            }
124        } else {
125            let _ = impl_;
126            unsafe { IMPL = (generic::shuffle, generic::unshuffle); }
127        }
128    }
129}
130
131/// Shuffle an array of fixed-size objects.
132///
133/// The result will be a shuffled byte array.  Be aware that this byte array will usually not be
134/// portable.  It can only be unshuffled by the same computer architecture as the one that shuffled
135/// it, and sometimes only by the same compiler version.  If portability is desired, then first
136/// serialize the data and then use [`shuffle`].
137///
138/// # Examples
139/// ```
140/// # use byteshuffle::*;
141/// const IN: [u16; 4] = [0x01, 0x02, 0x03, 0x04];
142/// let out = shuffle_objects(&IN);
143#[cfg_attr(
144    target_endian = "little",
145    doc = "assert_eq!(out, [0x01, 0x02, 0x03, 0x04, 0x00, 0x00, 0x00, 0x00]);"
146)]
147#[cfg_attr(
148    target_endian = "big",
149    doc = "assert_eq!(out, [0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04]);"
150)]
151/// ```
152pub fn shuffle_objects<T: Copy>(src: &[T]) -> Vec<u8> {
153    let ts = mem::size_of::<T>();
154    assert!(ts > 1, "No point shuffling plain [u8]");
155    let mut dst = Vec::with_capacity(mem::size_of_val(src));
156    // Safe because we src and dst have same length in bytes
157    unsafe {
158        IMPL.0(
159            ts,
160            mem::size_of_val(src),
161            src.as_ptr() as *const u8,
162            dst.as_mut_ptr(),
163        );
164        dst.set_len(mem::size_of_val(src));
165    };
166    dst
167}
168
169/// Shuffle a byte array whose contents are known to approximately repeat with
170/// period `typesize`.
171///
172/// # Examples
173/// ```
174/// # use byteshuffle::*;
175/// const IN: [u8; 8] = [0x00, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x04];
176/// let out = shuffle(2, &IN);
177/// assert_eq!(out, [0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04]);
178/// ```
179pub fn shuffle(typesize: usize, src: &[u8]) -> Vec<u8> {
180    let mut dst = Vec::with_capacity(src.len());
181    // Safe because src and dst have the same capacity
182    unsafe {
183        IMPL.0(typesize, src.len(), src.as_ptr(), dst.as_mut_ptr());
184        dst.set_len(src.len());
185    }
186    assert_eq!(src.len(), dst.len());
187    dst
188}
189
190/// Like [`shuffle`], but allows the caller to control allocation for the output.
191///
192/// # Examples
193/// ```
194/// # use byteshuffle::*;
195/// const IN: [u8; 8] = [0x00, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x04];
196/// let mut out = [0u8; 8];
197/// shuffle_into(2, &IN, &mut out);
198/// assert_eq!(out, [0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04]);
199/// ```
200pub fn shuffle_into(typesize: usize, src: &[u8], dst: &mut [u8]) {
201    assert_eq!(src.len(), dst.len());
202    // Safe because src and dst have the same length
203    unsafe {
204        IMPL.0(typesize, src.len(), src.as_ptr(), dst.as_mut_ptr());
205    }
206}
207
208/// Like [`shuffle_into`], but works with uninitialized output buffers.
209///
210/// # Example
211/// ```
212/// #![cfg_attr(feature = "nightly", feature(core_io_borrowed_buf))]
213/// use byteshuffle::*;
214/// use std::io::BorrowedBuf;
215/// use std::mem::MaybeUninit;
216///
217/// const IN: [u8; 8] = [0x00, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x04];
218/// let mut outvec = [MaybeUninit::uninit(); 8];
219/// let mut buf = BorrowedBuf::from(&mut outvec[..]);
220/// shuffle_buf(2, &IN, buf.unfilled());
221/// assert_eq!(buf.filled(), [0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04]);
222/// ```
223#[cfg(feature = "nightly")]
224#[cfg_attr(docsrs, doc(cfg(feature = "nightly")))]
225pub fn shuffle_buf(typesize: usize, src: &[u8], mut buf: BorrowedCursor<'_>) {
226    assert!(buf.capacity() >= src.len());
227    unsafe {
228        let dst: *mut u8 = buf.as_mut().as_mut_ptr().cast();
229        IMPL.0(typesize, src.len(), src.as_ptr(), dst);
230        buf.advance(src.len());
231    }
232}
233
234/// Unshuffle an array of fixed-size objects.
235///
236/// # Safety
237///
238/// The `src` must've originally been produced by [`shuffle`], by a program that uses the exact
239/// same byte layout as this one.  That means the same wordsize, same endianness, and may even
240/// require the same compiler version.  If you can't guarantee those conditions, then serialize and
241/// use [`shuffle`]/[`unshuffle`] instead.
242///
243/// # Examples
244/// ```
245/// # use byteshuffle::*;
246/// const IN: [u16; 4] = [0x01, 0x02, 0x03, 0x04];
247/// let mut shuffled = shuffle_objects(&IN);
248/// let out = unsafe { unshuffle_objects(&shuffled) };
249/// assert_eq!(IN, &out[..]);
250/// ```
251pub unsafe fn unshuffle_objects<T: Copy>(src: &[u8]) -> Vec<T> {
252    let ts = mem::size_of::<T>();
253    assert!(ts > 1, "No point shuffling plain [u8]");
254    let mut dst = Vec::with_capacity(src.len() / ts);
255    // Safe because we src and dst have same length in bytes
256    unsafe {
257        IMPL.1(
258            ts,
259            mem::size_of_val(src),
260            src.as_ptr(),
261            dst.as_mut_ptr() as *mut u8,
262        );
263        dst.set_len(src.len() / ts);
264    }
265    assert_eq!(mem::size_of_val(src), mem::size_of_val(&dst[..]));
266    dst
267}
268
269/// Unshuffle a byte array whose contents are known to approximately repeat with
270/// period `typesize`.
271///
272/// # Examples
273/// ```
274/// # use byteshuffle::*;
275/// const IN: [u8; 8] = [0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04];
276/// let out = unshuffle(2, &IN);
277/// assert_eq!(out, [0x00, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x04]);
278/// ```
279pub fn unshuffle(typesize: usize, src: &[u8]) -> Vec<u8> {
280    let mut dst = Vec::with_capacity(src.len());
281    unsafe {
282        IMPL.1(typesize, src.len(), src.as_ptr(), dst.as_mut_ptr());
283        dst.set_len(src.len());
284    }
285    assert_eq!(src.len(), dst.len());
286    dst
287}
288
289/// Like [`unshuffle`], but allows the caller to control allocation for the destination.
290///
291/// # Example
292/// ```
293/// use byteshuffle::*;
294///
295/// const IN: [u8; 8] = [0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04];
296/// let mut out = [0u8; 8];
297/// unshuffle_into(2, &IN, &mut out);
298/// assert_eq!(out, [0x00, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x04]);
299/// ```
300pub fn unshuffle_into(typesize: usize, src: &[u8], dst: &mut [u8]) {
301    assert_eq!(src.len(), dst.len());
302    unsafe {
303        IMPL.1(typesize, src.len(), src.as_ptr(), dst.as_mut_ptr());
304    }
305}
306
307/// Like [`unshuffle_into`], but works with uninitialized output buffers.
308///
309/// # Example
310/// ```
311/// #![cfg_attr(feature = "nightly", feature(core_io_borrowed_buf))]
312/// use byteshuffle::*;
313/// use std::io::BorrowedBuf;
314/// use std::mem::MaybeUninit;
315///
316/// const IN: [u8; 8] = [0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04];
317/// let mut outvec = [MaybeUninit::uninit(); 8];
318/// let mut buf = BorrowedBuf::from(&mut outvec[..]);
319/// unshuffle_buf(2, &IN, buf.unfilled());
320/// assert_eq!(buf.filled(), [0x00, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x04]);
321/// ```
322#[cfg(feature = "nightly")]
323#[cfg_attr(docsrs, doc(cfg(feature = "nightly")))]
324pub fn unshuffle_buf(typesize: usize, src: &[u8], mut buf: BorrowedCursor<'_>) {
325    assert!(buf.capacity() >= src.len());
326    unsafe {
327        let dst: *mut u8 = buf.as_mut().as_mut_ptr().cast();
328        IMPL.1(typesize, src.len(), src.as_ptr(), dst);
329        buf.advance(src.len());
330    }
331}
332
333#[cfg(test)]
334mod t {
335    use super::*;
336
337    /// Test the type-based shuffle_objects API
338    mod shuffle_objects {
339        use super::*;
340
341        // Two elements of two bytes each
342        #[test]
343        fn twobytwo() {
344            let src = [0x1234u16, 0x5678u16];
345            let dst = shuffle_objects(&src[..]);
346            cfg_if! {
347                if #[cfg(target_endian = "big")] {
348                    let expected = [0x78u8, 0x34, 0x56, 0x12];
349                } else {
350                    let expected = [0x34, 0x78u8, 0x12, 0x56];
351                }
352            }
353            assert_eq!(dst, &expected[..]);
354        }
355
356        // Four elements of four bytes each
357        #[test]
358        fn fourbyfour() {
359            let src = [0x11223344u32, 0x55667788, 0x99aabbcc, 0xddeeff00];
360            let dst = shuffle_objects(&src[..]);
361            cfg_if! {
362                if #[cfg(target_endian = "big")] {
363                    let expected = [0x00u8, 0xcc, 0x88, 0x44, 0xff, 0xbb, 0x77, 0x33,
364                       0xee, 0xaa, 0x66, 0x22, 0xdd, 0x99, 0x55, 0x11];
365                } else {
366                    let expected = [0x44, 0x88, 0xcc, 0x00, 0x33, 0x77, 0xbb, 0xff,
367                       0x22, 0x66, 0xaa, 0xee, 0x11, 0x55, 0x99, 0xdd];
368                }
369            }
370            assert_eq!(dst, &expected[..]);
371        }
372    }
373
374    mod shuffle {
375        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
376        use rand::Rng;
377        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
378        use rstest::rstest;
379
380        /// Compare optimized results against generic results
381        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
382        #[rstest]
383        #[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"),
384            case::sse2(crate::sse2::shuffle, is_x86_feature_detected!("sse2")))]
385        #[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"),
386            case::avx2(crate::avx2::shuffle, is_x86_feature_detected!("avx2")))]
387        #[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"),
388            case::avx512f(crate::avx512f::shuffle,
389                        is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512bw")
390            )
391        )]
392        fn compare(
393            #[values(2, 4, 8, 13, 16, 18, 32, 36, 43, 47)] typesize: usize,
394            #[values(64, 65, 256, 258, 1024, 1028, 4096, 4112)] len: usize,
395            #[case] f: unsafe fn(usize, usize, *const u8, *mut u8),
396            #[case] has_feature: bool,
397        ) {
398            if !has_feature {
399                eprintln!("Skipping: CPU feature unavailable.");
400                return;
401            }
402
403            let mut rng = rand::rng();
404
405            let src = (0..len).map(|_| rng.random()).collect::<Vec<u8>>();
406            let mut generic_dst = vec![0u8; len];
407            let mut opt_dst = vec![0u8; len];
408            unsafe {
409                crate::generic::shuffle(typesize, len, src.as_ptr(), generic_dst.as_mut_ptr());
410                f(typesize, len, src.as_ptr(), opt_dst.as_mut_ptr());
411            }
412            assert_eq!(generic_dst, opt_dst);
413        }
414    }
415
416    /// Test the type-based shuffle API
417    mod unshuffle_objects {
418        use super::*;
419
420        // Two elements of two bytes each
421        #[test]
422        fn twobytwo() {
423            cfg_if! {
424                if #[cfg(target_endian = "big")] {
425                    let src = [0x78u8, 0x34, 0x56, 0x12];
426                } else {
427                    let src = [0x34, 0x78u8, 0x12, 0x56];
428                }
429            }
430            let dst = unsafe { unshuffle_objects::<u16>(&src[..]) };
431            assert_eq!(dst, &[0x1234u16, 0x5678u16][..]);
432        }
433
434        // Four elements of four bytes each
435        #[test]
436        fn fourbyfour() {
437            cfg_if! {
438                if #[cfg(target_endian = "big")] {
439                    let src = [0x00u8, 0xcc, 0x88, 0x44, 0xff, 0xbb, 0x77, 0x33,
440                       0xee, 0xaa, 0x66, 0x22, 0xdd, 0x99, 0x55, 0x11];
441                } else {
442                    let src = [0x44, 0x88, 0xcc, 0x00, 0x33, 0x77, 0xbb, 0xff,
443                       0x22, 0x66, 0xaa, 0xee, 0x11, 0x55, 0x99, 0xdd];
444                }
445            }
446            let dst = unsafe { unshuffle_objects::<u32>(&src[..]) };
447            assert_eq!(
448                dst,
449                &[0x11223344u32, 0x55667788, 0x99aabbcc, 0xddeeff00][..]
450            );
451        }
452    }
453
454    mod unshuffle {
455        use rand::Rng;
456        use rstest::rstest;
457
458        /// Compare optimized results against generic results
459        #[rstest]
460        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
461        #[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"),
462            case::sse2(crate::sse2::unshuffle, is_x86_feature_detected!("sse2")))]
463        #[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"),
464            case::avx2(crate::avx2::unshuffle, is_x86_feature_detected!("avx2")))]
465        fn compare(
466            #[values(2, 4, 8, 16, 18, 32, 36, 43, 47)] typesize: usize,
467            #[values(64, 65, 256, 258, 1024, 1028, 4096, 4112)] len: usize,
468            #[case] f: unsafe fn(usize, usize, *const u8, *mut u8),
469            #[case] has_feature: bool,
470        ) {
471            if !has_feature {
472                eprintln!("Skipping: CPU feature unavailable.");
473                return;
474            }
475
476            let mut rng = rand::rng();
477
478            let src = (0..len).map(|_| rng.random()).collect::<Vec<u8>>();
479            let mut generic_dst = vec![0u8; len];
480            let mut opt_dst = vec![0u8; len];
481            unsafe {
482                crate::generic::unshuffle(typesize, len, src.as_ptr(), generic_dst.as_mut_ptr());
483                f(typesize, len, src.as_ptr(), opt_dst.as_mut_ptr());
484            }
485            assert_eq!(generic_dst, opt_dst);
486        }
487
488        /// unshuffle should be the inverse of shuffle
489        #[rstest]
490        fn inverse(
491            #[values(2, 4, 8, 16, 18, 32, 36, 43, 47)] typesize: usize,
492            #[values(64, 65, 256, 258, 1024, 1028, 4096, 4112)] len: usize,
493        ) {
494            let mut rng = rand::rng();
495
496            let src = (0..len).map(|_| rng.random()).collect::<Vec<u8>>();
497            let mut shuffled = vec![0u8; len];
498            let mut dst = vec![0u8; len];
499            unsafe {
500                crate::generic::shuffle(typesize, len, src.as_ptr(), shuffled.as_mut_ptr());
501                crate::generic::unshuffle(typesize, len, shuffled.as_ptr(), dst.as_mut_ptr());
502            }
503            assert_eq!(src, dst);
504        }
505    }
506}