varint_simd/decode/mod.rs

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use core::cmp::min;

use crate::num::{SignedVarIntTarget, VarIntTarget};
use crate::VarIntDecodeError;

mod lookup;

/// Decodes a single varint from the input slice.
///
/// Produces a tuple containing the decoded number and the number of bytes read. For best
/// performance, provide a slice at least 16 bytes in length, or use the unsafe version directly.
///
/// # Examples
/// ```
/// use varint_simd::{decode, VarIntDecodeError};
///
/// fn main() -> Result<(), VarIntDecodeError> {
///     let decoded = decode::<u32>(&[185, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])?;
///     assert_eq!(decoded, (1337, 2));
///     Ok(())
/// }
/// ```
#[inline]
pub fn decode<T: VarIntTarget>(bytes: &[u8]) -> Result<(T, usize), VarIntDecodeError> {
    let result = if bytes.len() >= 16 {
        unsafe { decode_unsafe(bytes.as_ptr()) }
    } else if !bytes.is_empty() {
        let mut data = [0u8; 16];
        let len = min(16, bytes.len());
        // unsafe { core::ptr::copy_nonoverlapping(bytes.as_ptr(), data.as_mut_ptr(), len); }
        data[..len].copy_from_slice(&bytes[..len]);
        unsafe { decode_unsafe(data.as_ptr()) }
    } else {
        return Err(VarIntDecodeError::NotEnoughBytes);
    };

    // The ordering of conditions here is weird because of a performance regression (?) in rustc 1.49
    if bytes.len() >= T::MAX_VARINT_BYTES as usize
        // we perform a signed comparison here because a valid last byte is always positive
        && unsafe { *bytes.get_unchecked((T::MAX_VARINT_BYTES - 1) as usize) } > T::MAX_LAST_VARINT_BYTE
        && result.1 == T::MAX_VARINT_BYTES as usize
        || result.1 > T::MAX_VARINT_BYTES as usize
    {
        Err(VarIntDecodeError::Overflow)
    } else if result.1 > bytes.len() {
        Err(VarIntDecodeError::NotEnoughBytes)
    } else {
        Ok(result)
    }
}

/// Decodes only the length of a single varint from the input slice.
///
/// # Examples
/// ```
/// use varint_simd::{decode_len, VarIntDecodeError};
///
/// fn main() -> Result<(), VarIntDecodeError> {
///     let decoded = decode_len::<u32>(&[185, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])?;
///     assert_eq!(decoded, 2);
///     Ok(())
/// }
/// ```
#[inline]
pub fn decode_len<T: VarIntTarget>(bytes: &[u8]) -> Result<usize, VarIntDecodeError> {
    let result = if bytes.len() >= 16 {
        unsafe { decode_len_unsafe::<T>(bytes.as_ptr()) }
    } else if !bytes.is_empty() {
        let mut data = [0u8; 16];
        let len = min(16, bytes.len());
        // unsafe { core::ptr::copy_nonoverlapping(bytes.as_ptr(), data.as_mut_ptr(), len); }
        data[..len].copy_from_slice(&bytes[..len]);
        unsafe { decode_len_unsafe::<T>(data.as_ptr()) }
    } else {
        return Err(VarIntDecodeError::NotEnoughBytes);
    };

    Ok(result)
}

/// Convenience function for decoding a single varint in ZigZag format from the input slice.
/// See also: [`decode`]
///
/// # Examples
/// ```
/// use varint_simd::{decode_zigzag, VarIntDecodeError};
///
/// fn main() -> Result<(), VarIntDecodeError> {
///     let decoded = decode_zigzag::<i32>(&[39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])?;
///     assert_eq!(decoded, (-20, 1));
///     Ok(())
/// }
/// ```
#[inline]
pub fn decode_zigzag<T: SignedVarIntTarget>(bytes: &[u8]) -> Result<(T, usize), VarIntDecodeError> {
    decode::<T::Unsigned>(bytes).map(|r| (r.0.unzigzag(), r.1))
}

/// Decodes only the length of the next varint from the input pointer.
///
/// # Safety
/// Same as `decode_unsafe`.
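///
/// # Examples
/// A minimal sketch, assuming the crate-root re-export (as with [`decode_len`]) and a buffer
/// that is at least 16 bytes long:
/// ```
/// use varint_simd::decode_len_unsafe;
///
/// let data = [185u8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
/// let len = unsafe { decode_len_unsafe::<u32>(data.as_ptr()) };
/// assert_eq!(len, 2);
/// ```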
#[inline]
pub unsafe fn decode_len_unsafe<T: VarIntTarget>(bytes: *const u8) -> usize {
    if T::MAX_VARINT_BYTES <= 5 {
        let b = bytes.cast::<u64>().read_unaligned();
        let msbs = !b & !0x7f7f7f7f7f7f7f7f;
        let len = msbs.trailing_zeros() + 1; // in bits
        (len / 8) as usize
    } else {
        let b0 = bytes.cast::<u64>().read_unaligned();
        let b1 = bytes.cast::<u64>().add(1).read_unaligned();

        let msbs0 = !b0 & !0x7f7f7f7f7f7f7f7f;
        let msbs1 = !b1 & !0x7f7f7f7f7f7f7f7f;

        let len0 = msbs0.trailing_zeros() + 1;
        let len1 = msbs1.trailing_zeros() + 1;

        let len = if msbs0 == 0 { len1 + 64 } else { len0 };
        len as usize / 8
    }
}

/// Decodes a single varint from the input pointer. Returns a tuple containing the decoded number
/// and the number of bytes read.
///
/// # Safety
/// There must be at least 16 bytes of allocated memory after the beginning of the pointer.
/// Otherwise, there may be undefined behavior. Any data after the end of the varint are ignored.
/// A truncated value will be returned if the varint represents a number too large for the target
/// type.
///
/// You may prefer to use this unsafe interface if you know what you are doing and need a little
/// extra performance.
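///
/// # Examples
/// A minimal sketch, assuming the crate-root re-export (as with [`decode`]) and a buffer padded
/// to at least 16 bytes:
/// ```
/// use varint_simd::decode_unsafe;
///
/// let data = [185u8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
/// let (num, len) = unsafe { decode_unsafe::<u32>(data.as_ptr()) };
/// assert_eq!((num, len), (1337, 2));
/// ```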
#[inline]
pub unsafe fn decode_unsafe<T: VarIntTarget>(bytes: *const u8) -> (T, usize) {
    // It looks like you're trying to understand what this code does. You should probably read
    // this first: https://developers.google.com/protocol-buffers/docs/encoding#varints

    if T::MAX_VARINT_BYTES <= 5 {
        // we can do everything in a normal 64-bit register
        let b = bytes.cast::<u64>().read_unaligned();
        // println!("{:#066b} b", b);

        // println!("{:#066b} op", !0x7f7f7f7f7f7f7f7fu64);
        let msbs = !b & !0x7f7f7f7f7f7f7f7f;
        // println!("{:#066b} msbs", msbs);
        /*
        TODO: theoretically, we could delay the `+1` and/or do it in parallel with other parts, but
         moving it downwards absolutely tanks performance and I have no idea why
        */
        let len = msbs.trailing_zeros() + 1; // in bits

        // println!("{}", len);

        // b & blsmsk(msbs)
        let varint_part = b & (msbs ^ msbs.wrapping_sub(1));
        // println!("{:#066b} varint_part", varint_part);

        let num = T::scalar_to_num(varint_part);

        (num, (len / 8) as usize)
    } else {
        let b0 = bytes.cast::<u64>().read_unaligned();
        let b1 = bytes.cast::<u64>().add(1).read_unaligned();

        let msbs0 = !b0 & !0x7f7f7f7f7f7f7f7f;
        let msbs1 = !b1 & !0x7f7f7f7f7f7f7f7f;

        // TODO: could this be faster on CPUs without fast tzcnt?
        // let blsi0 = msbs0.wrapping_neg() & msbs0;
        // let blsi1 = msbs1.wrapping_neg() & msbs1;
        //
        // let len0 = ((blsi0.wrapping_mul(0x20406080a0c0e1)) >> 60) & 15;
        // let len1 = ((blsi1.wrapping_mul(0x20406080a0c0e1)) >> 60) & 15;

        let len0 = msbs0.trailing_zeros() + 1;
        let len1 = msbs1.trailing_zeros() + 1;

        // doing this is faster than using len0, len1 because tzcnt has significant latency
        // and if the caller does not need the length, the call can be optimized out entirely
        // b0 & blsmsk(msbs0)
        let varint_part0 = b0 & (msbs0 ^ msbs0.wrapping_sub(1));
        // b1 & blsmsk(msbs1)
        let varint_part1 = (b1 & (msbs1 ^ msbs1.wrapping_sub(1))) * ((msbs0 == 0) as u64);

        // let varint_part0 = b0 & !(0xffffffffffffffff << len0.min(63));
        // let varint_part1 = b1 & !(0xffffffffffffffff << (((msbs0 == 0) as u32) * len1.min(63)));

        let num = T::vector_to_num(core::mem::transmute::<[u64; 2], [u8; 16]>([
            varint_part0,
            varint_part1,
        ]));
        let len = if msbs0 == 0 { len1 + 64 } else { len0 } / 8;

        (num, len as usize)
    }
}

/// Decodes two adjacent varints simultaneously. Target types must fit within 16 bytes when varint
/// encoded. Requires SSSE3 support.
///
/// For example, it is permissible to decode a `u32` and a `u32`, or a `u64` and a `u32`, but it
/// is not possible to decode two `u64` values with this function simultaneously.
///
/// Returns a tuple containing the two decoded values and the two lengths of bytes read for each
/// value.
///
/// For best performance, ensure each target type is `u32` or smaller.
///
/// # Safety
/// There must be at least 16 bytes of allocated memory after the start of the pointer. Otherwise,
/// there may be undefined behavior. Any data after the two varints are ignored. Truncated values
/// will be returned if a varint exceeds the target type's limit.
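///
/// # Examples
/// A minimal sketch, assuming the crate-root re-export and a build compiled with SSSE3 enabled
/// (e.g. `RUSTFLAGS="-C target-feature=+ssse3"`); the block is marked `ignore` because the
/// function is compiled out otherwise:
/// ```ignore
/// use varint_simd::decode_two_unsafe;
///
/// // 1337 encodes as [185, 10]; 39 encodes as [39]
/// let data = [185u8, 10, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
/// let (first, second, first_len, second_len) =
///     unsafe { decode_two_unsafe::<u32, u32>(data.as_ptr()) };
/// assert_eq!((first, second, first_len, second_len), (1337, 39, 2, 1));
/// ```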
#[inline]
#[cfg(any(target_feature = "ssse3", doc))]
#[cfg_attr(rustc_nightly, doc(cfg(target_feature = "ssse3")))]
pub unsafe fn decode_two_unsafe<T: VarIntTarget, U: VarIntTarget>(
    bytes: *const u8,
) -> (T, U, u8, u8) {
    if T::MAX_VARINT_BYTES + U::MAX_VARINT_BYTES > 16 {
        // check will be eliminated at compile time
        panic!(
            "exceeded length limit: cannot decode {} and {}, total length {} exceeds 16 bytes",
            core::any::type_name::<T>(),
            core::any::type_name::<U>(),
            T::MAX_VARINT_BYTES + U::MAX_VARINT_BYTES
        );
    }

    if T::MAX_VARINT_BYTES <= 5 && U::MAX_VARINT_BYTES <= 5 {
        // This will work with our lookup table, use that version
        return decode_two_u32_unsafe(bytes);
    }

    let b = _mm_loadu_si128(bytes as *const __m128i);

    // First find where the boundaries are
    let bitmask = _mm_movemask_epi8(b) as u32;

    // Find the number of bytes taken up by each varint
    let bm_not = !bitmask;
    let first_len = bm_not.trailing_zeros() + 1; // should compile to bsf or tzcnt
    let bm_not_2 = bm_not >> first_len;
    let second_len = bm_not_2.trailing_zeros() + 1;

    let ascend = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

    let first_len_vec = _mm_set1_epi8(first_len as i8);
    let first_mask = _mm_cmplt_epi8(ascend, first_len_vec);
    let first = _mm_and_si128(b, first_mask);

    let second_shuf = _mm_add_epi8(ascend, first_len_vec);
    let second_shuffled = _mm_shuffle_epi8(b, second_shuf);
    let second_mask = _mm_cmplt_epi8(ascend, _mm_set1_epi8(second_len as i8));
    let second = _mm_and_si128(second_shuffled, second_mask);

    let first_num;
    let second_num;

    // Only use "turbo" mode if the numbers fit in 64-bit lanes
    let should_turbo = T::MAX_VARINT_BYTES <= 8
        && U::MAX_VARINT_BYTES <= 8
        && cfg!(not(all(target_feature = "bmi2", very_fast_pdep)));
    if should_turbo {
        // const, so optimized out
        let comb = _mm_or_si128(first, _mm_bslli_si128(second, 8));

        let x = if T::MAX_VARINT_BYTES <= 2 && U::MAX_VARINT_BYTES <= 2 {
            dual_u8_stage2(comb)
        } else if T::MAX_VARINT_BYTES <= 3 && U::MAX_VARINT_BYTES <= 3 {
            dual_u16_stage2(comb)
        } else {
            dual_u32_stage2(comb)
        };

        let x: [u32; 4] = core::mem::transmute(x);
        // _mm_extract_epi32 requires SSE4.1
        first_num = T::cast_u32(x[0]);
        second_num = U::cast_u32(x[2]);
    } else {
        first_num = T::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(first));
        second_num = U::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(second));
    }

    (first_num, second_num, first_len as u8, second_len as u8)
}

#[inline]
#[cfg(any(target_feature = "ssse3", doc))]
unsafe fn decode_two_u32_unsafe<T: VarIntTarget, U: VarIntTarget>(
    bytes: *const u8,
) -> (T, U, u8, u8) {
    let b = _mm_loadu_si128(bytes as *const __m128i);

    // Get the movemask and mask out irrelevant parts
    let bitmask = _mm_movemask_epi8(b) as u32 & 0b1111111111;

    // Use lookup table to get the shuffle mask
    let (lookup, first_len, second_len) =
        *lookup::LOOKUP_DOUBLE_STEP1.get_unchecked(bitmask as usize);
    let shuf = *lookup::LOOKUP_DOUBLE_VEC.get_unchecked(lookup as usize);

    let comb = _mm_shuffle_epi8(b, shuf);

    let first_num;
    let second_num;

    // Only use "turbo" mode if PDEP/PEXT are not faster
    let should_turbo = cfg!(not(all(target_feature = "bmi2", very_fast_pdep)));
    if should_turbo {
        // const, so optimized out

        let x = if T::MAX_VARINT_BYTES <= 2 && U::MAX_VARINT_BYTES <= 2 {
            dual_u8_stage2(comb)
        } else if T::MAX_VARINT_BYTES <= 3 && U::MAX_VARINT_BYTES <= 3 {
            dual_u16_stage2(comb)
        } else {
            dual_u32_stage2(comb)
        };

        let x: [u32; 4] = core::mem::transmute(x);
        // _mm_extract_epi32 requires SSE4.1
        first_num = T::cast_u32(x[0]);
        second_num = U::cast_u32(x[2]);
    } else {
        first_num = T::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(comb));
        second_num = U::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(_mm_bsrli_si128(
            comb, 8,
        )));
    }

    (first_num, second_num, first_len, second_len)
}

/// Strips the continuation bits from a (up to 2-byte) varint held in each 64-bit lane of `comb`,
/// leaving the decoded value in the low bits of that lane.
#[inline(always)]
unsafe fn dual_u8_stage2(comb: __m128i) -> __m128i {
    _mm_or_si128(
        _mm_and_si128(comb, _mm_set_epi64x(0x000000000000007f, 0x000000000000007f)),
        _mm_srli_epi64(
            _mm_and_si128(comb, _mm_set_epi64x(0x0000000000000100, 0x0000000000000100)),
            1,
        ),
    )
}

/// Same as `dual_u8_stage2`, but for varints up to 3 bytes (u16 targets).
#[inline(always)]
unsafe fn dual_u16_stage2(comb: __m128i) -> __m128i {
    _mm_or_si128(
        _mm_or_si128(
            _mm_and_si128(comb, _mm_set_epi64x(0x000000000000007f, 0x000000000000007f)),
            _mm_srli_epi64(
                _mm_and_si128(comb, _mm_set_epi64x(0x0000000000030000, 0x0000000000030000)),
                2,
            ),
        ),
        _mm_srli_epi64(
            _mm_and_si128(comb, _mm_set_epi64x(0x0000000000007f00, 0x0000000000007f00)),
            1,
        ),
    )
}

/// Same as `dual_u8_stage2`, but for varints up to 5 bytes (u32 targets).
#[inline(always)]
unsafe fn dual_u32_stage2(comb: __m128i) -> __m128i {
    _mm_or_si128(
        _mm_or_si128(
            _mm_and_si128(comb, _mm_set_epi64x(0x000000000000007f, 0x000000000000007f)),
            _mm_srli_epi64(
                _mm_and_si128(comb, _mm_set_epi64x(0x0000000f00000000, 0x0000000f00000000)),
                4,
            ),
        ),
        _mm_or_si128(
            _mm_or_si128(
                _mm_srli_epi64(
                    _mm_and_si128(comb, _mm_set_epi64x(0x000000007f000000, 0x000000007f000000)),
                    3,
                ),
                _mm_srli_epi64(
                    _mm_and_si128(comb, _mm_set_epi64x(0x00000000007f0000, 0x00000000007f0000)),
                    2,
                ),
            ),
            _mm_srli_epi64(
                _mm_and_si128(comb, _mm_set_epi64x(0x0000000000007f00, 0x0000000000007f00)),
                1,
            ),
        ),
    )
}

/// **Experimental. May have relatively poor performance.** Decodes two adjacent varints
/// simultaneously from the input pointer. Requires AVX2. Allows for decoding a pair of `u64`
/// values. For smaller values, the non-wide variation of this function will probably be faster.
///
/// Returns a tuple containing the two decoded values and the two lengths of bytes read for each
/// value.
///
/// # Safety
/// There must be at least 32 bytes of allocated memory after the beginning of the pointer.
/// Otherwise, there may be undefined behavior. Calling code should ensure that AVX2 is supported
/// before referencing this function.
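///
/// # Examples
/// A minimal sketch, assuming the crate-root re-export and a build compiled with AVX2 enabled;
/// note the 32-byte buffer requirement. Marked `ignore` because the function is compiled out
/// otherwise:
/// ```ignore
/// use varint_simd::decode_two_wide_unsafe;
///
/// let data = [185u8, 10, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
///             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
/// let (first, second, first_len, second_len) =
///     unsafe { decode_two_wide_unsafe::<u64, u64>(data.as_ptr()) };
/// assert_eq!((first, second, first_len, second_len), (1337, 39, 2, 1));
/// ```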
#[inline]
#[cfg(any(target_feature = "avx2", doc))]
#[cfg_attr(rustc_nightly, doc(cfg(target_feature = "avx2")))]
pub unsafe fn decode_two_wide_unsafe<T: VarIntTarget, U: VarIntTarget>(
    bytes: *const u8,
) -> (T, U, u8, u8) {
    let b = _mm256_loadu_si256(bytes as *const __m256i);

    // Get the most significant bits
    let bitmask = _mm256_movemask_epi8(b) as u32;

    // Find the number of bytes taken up by each varint
    let bm_not = !bitmask;
    let first_len = bm_not.trailing_zeros() + 1; // should compile to bsf or tzcnt
    let bm_not_2 = bm_not >> first_len;
    let second_len = bm_not_2.trailing_zeros() + 1;

    // Create and parse vector consisting solely of the first varint
    let ascend = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    let first_mask = _mm_cmplt_epi8(ascend, _mm_set1_epi8(first_len as i8));
    let first = _mm_and_si128(_mm256_extracti128_si256(b, 0), first_mask);

    // The second is much more tricky.
    let shuf_gen = _mm256_setr_epi8(
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
        12, 13, 14, 15,
    );

    // Rearrange each 128-bit lane such that ORing them together results in the window of data we want
    let shuf_add = _mm256_set_m128i(
        _mm_set1_epi8(-(16i8 - first_len as i8)),
        _mm_set1_epi8(first_len as i8),
    );
    let shuf_added = _mm256_add_epi8(shuf_gen, shuf_add);
    let shuf = _mm256_or_si256(
        shuf_added,
        _mm256_cmpgt_epi8(shuf_added, _mm256_set1_epi8(15)), // TODO: Is this really necessary?
    );
    let shuffled = _mm256_shuffle_epi8(b, shuf);

    // OR the halves together, and now we have a view of the second varint
    let second_shifted = _mm_or_si128(
        _mm256_extracti128_si256(shuffled, 0),
        _mm256_extracti128_si256(shuffled, 1),
    );
    let second_mask = _mm_cmplt_epi8(ascend, _mm_set1_epi8(second_len as i8));
    let second = _mm_and_si128(second_shifted, second_mask);

    let first_num;
    let second_num;

    // PEXT on the two halves is still slower, at least on Coffee Lake and Broadwell
    let should_turbo = true;
    if should_turbo {
        // Decode the two halves in parallel using SSE2
        let comb_lo = _mm_unpacklo_epi64(first, second);
        let x_lo = _mm_or_si128(
            _mm_or_si128(
                _mm_or_si128(
                    _mm_and_si128(comb_lo, _mm_set1_epi64x(0x000000000000007f)),
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x7f00000000000000)),
                        7,
                    ),
                ),
                _mm_or_si128(
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x007f000000000000)),
                        6,
                    ),
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x00007f0000000000)),
                        5,
                    ),
                ),
            ),
            _mm_or_si128(
                _mm_or_si128(
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x0000007f00000000)),
                        4,
                    ),
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x000000007f000000)),
                        3,
                    ),
                ),
                _mm_or_si128(
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x00000000007f0000)),
                        2,
                    ),
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x0000000000007f00)),
                        1,
                    ),
                ),
            ),
        );

        let comb_hi = _mm_unpackhi_epi64(first, second);
        let x_hi = _mm_or_si128(
            _mm_slli_epi64(
                _mm_and_si128(comb_hi, _mm_set1_epi64x(0x0000000000000100)),
                55,
            ),
            _mm_slli_epi64(
                _mm_and_si128(comb_hi, _mm_set1_epi64x(0x000000000000007f)),
                56,
            ),
        );

        let x = _mm_or_si128(x_lo, x_hi);

        first_num = T::cast_u64(_mm_extract_epi64(x, 0) as u64);
        second_num = U::cast_u64(_mm_extract_epi64(x, 1) as u64);
    } else {
        first_num = T::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(first));
        second_num = U::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(second));
    }

    (first_num, second_num, first_len as u8, second_len as u8)
}

/// Decodes four adjacent varints simultaneously. Target types must fit within 16 bytes when varint
/// encoded. Requires SSSE3 support.
///
/// Returns a tuple containing the four decoded values, followed by the number of bytes read for
/// each encoded value, followed by a boolean indicator for whether the length values may be
/// incorrect due to overflow.
///
/// For best performance, ensure each target type is `u16` or smaller.
///
/// # Safety
/// There must be at least 16 bytes of allocated memory after the start of the pointer. Otherwise,
/// there may be undefined behavior. Any data after the four varints are ignored. Truncated values
/// will be returned if a varint exceeds the target type's limit.
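///
/// # Examples
/// A minimal sketch, assuming the crate-root re-export and a build compiled with SSSE3 enabled;
/// marked `ignore` because the function is compiled out otherwise:
/// ```ignore
/// use varint_simd::decode_four_unsafe;
///
/// // 1, 2, 300 (encoded as [172, 2]), and 4
/// let data = [1u8, 2, 172, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
/// let (a, b, c, d, la, lb, lc, ld, invalid) =
///     unsafe { decode_four_unsafe::<u16, u16, u16, u16>(data.as_ptr()) };
/// assert_eq!((a, b, c, d), (1, 2, 300, 4));
/// assert_eq!((la, lb, lc, ld), (1, 1, 2, 1));
/// assert!(!invalid);
/// ```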
#[inline]
#[cfg(any(target_feature = "ssse3", doc))]
#[cfg_attr(rustc_nightly, doc(cfg(target_feature = "ssse3")))]
pub unsafe fn decode_four_unsafe<
    T: VarIntTarget,
    U: VarIntTarget,
    V: VarIntTarget,
    W: VarIntTarget,
>(
    bytes: *const u8,
) -> (T, U, V, W, u8, u8, u8, u8, bool) {
    if T::MAX_VARINT_BYTES + U::MAX_VARINT_BYTES + V::MAX_VARINT_BYTES + W::MAX_VARINT_BYTES > 16 {
        // check will be eliminated at compile time
        panic!(
            "exceeded length limit: cannot decode {}, {}, {}, and {}, total length {} exceeds 16 bytes",
            core::any::type_name::<T>(),
            core::any::type_name::<U>(),
            core::any::type_name::<V>(),
            core::any::type_name::<W>(),
            T::MAX_VARINT_BYTES + U::MAX_VARINT_BYTES + V::MAX_VARINT_BYTES + W::MAX_VARINT_BYTES
        );
    }

    if T::MAX_VARINT_BYTES <= 3
        && U::MAX_VARINT_BYTES <= 3
        && V::MAX_VARINT_BYTES <= 3
        && W::MAX_VARINT_BYTES <= 3
    {
        return decode_four_u16_unsafe(bytes);
    }

    let b = _mm_loadu_si128(bytes as *const __m128i);

    // First find where the boundaries are
    let bitmask = _mm_movemask_epi8(b) as u32;

    // Find the number of bytes taken up by each varint
    let bm_not = !bitmask;
    let first_len = bm_not.trailing_zeros() + 1; // should compile to bsf or tzcnt
    let bm_not_2 = bm_not >> first_len;
    let second_len = bm_not_2.trailing_zeros() + 1;
    let bm_not_3 = bm_not_2 >> second_len;
    let third_len = bm_not_3.trailing_zeros() + 1;
    let bm_not_4 = bm_not_3 >> third_len;
    let fourth_len = bm_not_4.trailing_zeros() + 1;

    let ascend = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

    let first_len_vec = _mm_set1_epi8(first_len as i8);
    let first_mask = _mm_cmplt_epi8(ascend, first_len_vec);
    let first = _mm_and_si128(b, first_mask);

    let second_shuf = _mm_add_epi8(ascend, first_len_vec);
    let second_shuffled = _mm_shuffle_epi8(b, second_shuf);
    let second_len_vec = _mm_set1_epi8(second_len as i8);
    let second_mask = _mm_cmplt_epi8(ascend, second_len_vec);
    let second = _mm_and_si128(second_shuffled, second_mask);

    let third_shuf = _mm_add_epi8(ascend, second_len_vec);
    let third_shuffled = _mm_shuffle_epi8(second_shuffled, third_shuf);
    let third_len_vec = _mm_set1_epi8(third_len as i8);
    let third_mask = _mm_cmplt_epi8(ascend, third_len_vec);
    let third = _mm_and_si128(third_shuffled, third_mask);

    let fourth_shuf = _mm_add_epi8(ascend, third_len_vec);
    let fourth_shuffled = _mm_shuffle_epi8(third_shuffled, fourth_shuf);
    let fourth_len_vec = _mm_set1_epi8(fourth_len as i8);
    let fourth_mask = _mm_cmplt_epi8(ascend, fourth_len_vec);
    let fourth = _mm_and_si128(fourth_shuffled, fourth_mask);

    let first_num;
    let second_num;
    let third_num;
    let fourth_num;

    // Only use "turbo" mode if the numbers fit in 32-bit lanes
    let should_turbo = T::MAX_VARINT_BYTES <= 4
        && U::MAX_VARINT_BYTES <= 4
        && V::MAX_VARINT_BYTES <= 4
        && W::MAX_VARINT_BYTES <= 4
        // PDEP/PEXT are still a little faster here
        && cfg!(not(all(
            target_feature = "bmi2",
            very_fast_pdep
        )));
    if should_turbo {
        // const, so optimized out
        let comb = _mm_or_si128(
            _mm_or_si128(first, _mm_bslli_si128(second, 4)),
            _mm_or_si128(_mm_bslli_si128(third, 8), _mm_bslli_si128(fourth, 12)),
        );

        let x = if T::MAX_VARINT_BYTES <= 2
            && U::MAX_VARINT_BYTES <= 2
            && V::MAX_VARINT_BYTES <= 2
            && W::MAX_VARINT_BYTES <= 2
        {
            _mm_or_si128(
                _mm_and_si128(comb, _mm_set1_epi32(0x0000007f)),
                _mm_srli_epi32(_mm_and_si128(comb, _mm_set1_epi32(0x00000100)), 1),
            )
        } else {
            _mm_or_si128(
                _mm_or_si128(
                    _mm_and_si128(comb, _mm_set1_epi32(0x0000007f)),
                    _mm_srli_epi32(_mm_and_si128(comb, _mm_set1_epi32(0x00030000)), 2),
                ),
                _mm_srli_epi32(_mm_and_si128(comb, _mm_set1_epi32(0x00007f00)), 1),
            )
        };

        let x: [u32; 4] = core::mem::transmute(x);
        // _mm_extract_epi32 requires SSE4.1
        first_num = T::cast_u32(x[0]);
        second_num = U::cast_u32(x[1]);
        third_num = V::cast_u32(x[2]);
        fourth_num = W::cast_u32(x[3]);
    } else {
        first_num = T::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(first));
        second_num = U::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(second));
        third_num = V::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(third));
        fourth_num = W::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(fourth));
    }

    (
        first_num,
        second_num,
        third_num,
        fourth_num,
        first_len as u8,
        second_len as u8,
        third_len as u8,
        fourth_len as u8,
        false,
    )
}

#[inline]
#[cfg(any(target_feature = "ssse3", doc))]
#[cfg_attr(rustc_nightly, doc(cfg(target_feature = "ssse3")))]
unsafe fn decode_four_u16_unsafe<
    T: VarIntTarget,
    U: VarIntTarget,
    V: VarIntTarget,
    W: VarIntTarget,
>(
    bytes: *const u8,
) -> (T, U, V, W, u8, u8, u8, u8, bool) {
    let b = _mm_loadu_si128(bytes as *const __m128i);

    // First find where the boundaries are
    let bitmask = _mm_movemask_epi8(b) as u32;

    // Use the lookup table
    let lookup = *lookup::LOOKUP_QUAD_STEP1.get_unchecked((bitmask & 0b111111111111) as usize);

    // Fetch the shuffle mask
    let shuf = *lookup::LOOKUP_QUAD_VEC.get_unchecked((lookup & 0b11111111) as usize);

    // Extract the lengths while we're waiting
    let first_len = (lookup >> 8) & 0b1111;
    let second_len = (lookup >> 12) & 0b1111;
    let third_len = (lookup >> 16) & 0b1111;
    let fourth_len = (lookup >> 20) & 0b1111;

    let comb = _mm_shuffle_epi8(b, shuf);

    let invalid = lookup >> 31;

    let first_num;
    let second_num;
    let third_num;
    let fourth_num;

    // PDEP/PEXT may be still a little faster here
    let should_turbo = cfg!(not(all(target_feature = "bmi2", very_fast_pdep)));
    if should_turbo {
        // const, so optimized out

        let x = if T::MAX_VARINT_BYTES <= 2
            && U::MAX_VARINT_BYTES <= 2
            && V::MAX_VARINT_BYTES <= 2
            && W::MAX_VARINT_BYTES <= 2
        {
            _mm_or_si128(
                _mm_and_si128(comb, _mm_set1_epi32(0x0000007f)),
                _mm_srli_epi32(_mm_and_si128(comb, _mm_set1_epi32(0x00000100)), 1),
            )
        } else {
            _mm_or_si128(
                _mm_or_si128(
                    _mm_and_si128(comb, _mm_set1_epi32(0x0000007f)),
                    _mm_srli_epi32(_mm_and_si128(comb, _mm_set1_epi32(0x00030000)), 2),
                ),
                _mm_srli_epi32(_mm_and_si128(comb, _mm_set1_epi32(0x00007f00)), 1),
            )
        };

        let x: [u32; 4] = core::mem::transmute(x);
        // _mm_extract_epi32 requires SSE4.1
        first_num = T::cast_u32(x[0]);
        second_num = U::cast_u32(x[1]);
        third_num = V::cast_u32(x[2]);
        fourth_num = W::cast_u32(x[3]);
    } else {
        first_num = T::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(comb));
        second_num = U::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(_mm_bsrli_si128(
            comb, 4,
        )));
        third_num = V::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(_mm_bsrli_si128(
            comb, 8,
        )));
        fourth_num = W::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(_mm_bsrli_si128(
            comb, 12,
        )));
    }

    (
        first_num,
        second_num,
        third_num,
        fourth_num,
        first_len as u8,
        second_len as u8,
        third_len as u8,
        fourth_len as u8,
        invalid != 0,
    )
}

/// Decodes eight adjacent varints into `u8`s simultaneously. Requires SSSE3 support. **Does not
/// perform overflow checking and may produce incorrect output.**
///
/// Returns a tuple containing an array of decoded values, and the total number of bytes read.
///
/// # Safety
/// There must be at least 16 bytes of allocated memory after the start of the pointer. Otherwise,
/// there may be undefined behavior. Truncated values will be returned if the varint represents
/// a number larger than what a u8 can handle.
///
/// This function does not perform overflow checking. If a varint exceeds two bytes in encoded
/// length, it may be interpreted as multiple varints, and the reported length of data read will
/// be shorter than expected. Caution is encouraged when using this function.
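///
/// # Examples
/// A minimal sketch, assuming the crate-root re-export and a build compiled with SSSE3 enabled;
/// marked `ignore` because the function is compiled out otherwise:
/// ```ignore
/// use varint_simd::decode_eight_u8_unsafe;
///
/// // Eight varints: 1, 2, 3, 200 (encoded as [200, 1]), 5, 6, 7, 8
/// let data = [1u8, 2, 3, 200, 1, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0];
/// let (nums, bytes_read) = unsafe { decode_eight_u8_unsafe(data.as_ptr()) };
/// assert_eq!(nums, [1, 2, 3, 200, 5, 6, 7, 8]);
/// assert_eq!(bytes_read, 9);
/// ```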
#[inline]
#[cfg(any(target_feature = "ssse3", doc))]
#[cfg_attr(rustc_nightly, doc(cfg(target_feature = "ssse3")))]
pub unsafe fn decode_eight_u8_unsafe(bytes: *const u8) -> ([u8; 8], u8) {
    let b = _mm_loadu_si128(bytes as *const __m128i);

    let ones = _mm_set1_epi8(1);
    let mut lens = _mm_setzero_si128();
    let mut shift = _mm_and_si128(_mm_cmplt_epi8(b, _mm_setzero_si128()), ones);
    let ascend = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    let asc_one = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    let mut window_small = _mm_setr_epi8(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

    let broadcast_mask = _mm_setzero_si128();

    // if the first byte is zero, shift down by 1, if the first byte is one, shift down by 2
    // 0
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);

    // 1
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);

    // 2
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);

    // 3
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);

    // 4
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);

    // 5
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);

    // 6
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);

    // 7
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    // shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    // window_small = _mm_bslli_si128(window_small, 1);

    // Construct the shuffle

    let lens_invert = _mm_sub_epi8(ones, lens);
    let mut cumul_lens = _mm_add_epi8(lens_invert, _mm_bslli_si128(lens_invert, 1));
    cumul_lens = _mm_add_epi8(cumul_lens, _mm_bslli_si128(cumul_lens, 2));
    cumul_lens = _mm_add_epi8(cumul_lens, _mm_bslli_si128(cumul_lens, 4));
    cumul_lens = _mm_add_epi8(cumul_lens, _mm_bslli_si128(cumul_lens, 8));

    let cumul_lens_2: [u8; 16] = core::mem::transmute(cumul_lens);
    let last_len = 8 - cumul_lens_2[7] + 8;

    // Set one-lengthed second bytes to negative
    let second = _mm_shuffle_epi8(
        _mm_add_epi8(lens, ones),
        _mm_setr_epi8(-1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7),
    );

    let shuf_pt1 = _mm_or_si128(ascend, _mm_cmpeq_epi8(second, ones));

    // Subtract the cumulative sum of zero-lengths to adjust the indexes
    let x_shuf = _mm_shuffle_epi8(
        _mm_bslli_si128(cumul_lens, 1),
        _mm_setr_epi8(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7),
    );

    let shuf = _mm_sub_epi8(shuf_pt1, x_shuf);
    let comb = _mm_shuffle_epi8(b, shuf);

    let x = _mm_or_si128(
        _mm_and_si128(comb, _mm_set1_epi16(0x0000007f)),
        _mm_srli_epi16(_mm_and_si128(comb, _mm_set1_epi16(0x00000100)), 1),
    );

    let shuf = _mm_shuffle_epi8(
        x,
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1),
    );
    let lower: [u64; 2] = core::mem::transmute(shuf);
    let nums = lower[0].to_ne_bytes();

    (nums, last_len)
}