1#[cfg(target_arch = "x86")]
2use core::arch::x86::*;
3#[cfg(target_arch = "x86_64")]
4use core::arch::x86_64::*;
5use core::cmp::min;
6
7use crate::num::{SignedVarIntTarget, VarIntTarget};
8use crate::VarIntDecodeError;
9
10mod lookup;
11
12#[inline]
28pub fn decode<T: VarIntTarget>(bytes: &[u8]) -> Result<(T, usize), VarIntDecodeError> {
29 let result = if bytes.len() >= 16 {
30 unsafe { decode_unsafe(bytes.as_ptr()) }
31 } else if !bytes.is_empty() {
32 let mut data = [0u8; 16];
33 let len = min(16, bytes.len());
34 data[..len].copy_from_slice(&bytes[..len]);
36 unsafe { decode_unsafe(data.as_ptr()) }
37 } else {
38 return Err(VarIntDecodeError::NotEnoughBytes);
39 };
40
41 if bytes.len() >= T::MAX_VARINT_BYTES as usize
43 && unsafe { *bytes.get_unchecked((T::MAX_VARINT_BYTES - 1) as usize) } > T::MAX_LAST_VARINT_BYTE
45 && result.1 == T::MAX_VARINT_BYTES as usize
46 || result.1 > T::MAX_VARINT_BYTES as usize
47 {
48 Err(VarIntDecodeError::Overflow)
49 } else if result.1 > bytes.len() {
50 Err(VarIntDecodeError::NotEnoughBytes)
51 } else {
52 Ok(result)
53 }
54}
55
56#[inline]
69pub fn decode_len<T: VarIntTarget>(bytes: &[u8]) -> Result<usize, VarIntDecodeError> {
70 let result = if bytes.len() >= 16 {
71 unsafe { decode_len_unsafe::<T>(bytes.as_ptr()) }
72 } else if !bytes.is_empty() {
73 let mut data = [0u8; 16];
74 let len = min(16, bytes.len());
75 data[..len].copy_from_slice(&bytes[..len]);
77 unsafe { decode_len_unsafe::<T>(data.as_ptr()) }
78 } else {
79 return Err(VarIntDecodeError::NotEnoughBytes);
80 };
81
82 Ok(result)
83}
84
85#[inline]
99pub fn decode_zigzag<T: SignedVarIntTarget>(bytes: &[u8]) -> Result<(T, usize), VarIntDecodeError> {
100 decode::<T::Unsigned>(bytes).map(|r| (r.0.unzigzag(), r.1))
101}
102
/// Returns the encoded length in bytes of the varint starting at `bytes`,
/// using scalar bit tricks on 64-bit words.
///
/// # Safety
/// At least 16 bytes must be readable starting at `bytes`, regardless of the
/// actual length of the encoded value.
#[inline]
pub unsafe fn decode_len_unsafe<T: VarIntTarget>(bytes: *const u8) -> usize {
    if T::MAX_VARINT_BYTES <= 5 {
        // Targets encoded in at most 5 bytes fit entirely in one 64-bit word.
        let b = bytes.cast::<u64>().read_unaligned();
        // Invert each byte's continuation (MSB) bit and clear the low 7 bits:
        // the lowest set bit of `msbs` now marks the terminating byte.
        let msbs = !b & !0x7f7f7f7f7f7f7f7f;
        // Bit index just past the terminator's MSB; dividing by 8 converts it
        // to a whole-byte count.
        let len = msbs.trailing_zeros() + 1;
        (len / 8) as usize
    } else {
        // Longer encodings (up to 10 bytes for 64-bit targets) need two
        // words: bytes [0..8) and [8..16). Note `.add(1)` advances by one
        // *u64*, i.e. 8 bytes.
        let b0 = bytes.cast::<u64>().read_unaligned();
        let b1 = bytes.cast::<u64>().add(1).read_unaligned();

        // Inverted continuation bits for each half, as above.
        let msbs0 = !b0 & !0x7f7f7f7f7f7f7f7f;
        let msbs1 = !b1 & !0x7f7f7f7f7f7f7f7f;

        // Candidate terminator positions (in bits) within each half.
        let len0 = msbs0.trailing_zeros() + 1;
        let len1 = msbs1.trailing_zeros() + 1;

        // If no terminator appears in the low half (`msbs0 == 0`), the varint
        // continues into the high half; offset that position by 64 bits.
        let len = if msbs0 == 0 { len1 + 64 } else { len0 };
        len as usize / 8
    }
}
128
/// Decodes the varint starting at `bytes` using scalar bit tricks, returning
/// the value and the number of bytes consumed.
///
/// No overflow or length validation is performed here; the safe [`decode`]
/// wrapper applies those checks on the returned length.
///
/// # Safety
/// At least 16 bytes must be readable starting at `bytes`, regardless of the
/// actual length of the encoded value.
#[inline]
pub unsafe fn decode_unsafe<T: VarIntTarget>(bytes: *const u8) -> (T, usize) {
    if T::MAX_VARINT_BYTES <= 5 {
        // Single-word path for targets encoded in at most 5 bytes.
        let b = bytes.cast::<u64>().read_unaligned();
        // Lowest set bit of `msbs` marks the terminating byte (the first byte
        // whose continuation MSB is clear).
        let msbs = !b & !0x7f7f7f7f7f7f7f7f;
        let len = msbs.trailing_zeros() + 1;
        // `msbs ^ (msbs - 1)` is a mask of all bits up to and including the
        // lowest set bit, i.e. exactly the bytes belonging to this varint
        // (everything, if no terminator was found in the word).
        let varint_part = b & (msbs ^ msbs.wrapping_sub(1));
        // Strip the continuation bits and assemble the 7-bit groups.
        let num = T::scalar_to_num(varint_part);

        (num, (len / 8) as usize)
    } else {
        // Two-word path: bytes [0..8) and [8..16) (`.add(1)` steps one u64).
        let b0 = bytes.cast::<u64>().read_unaligned();
        let b1 = bytes.cast::<u64>().add(1).read_unaligned();

        // Inverted continuation bits of each half.
        let msbs0 = !b0 & !0x7f7f7f7f7f7f7f7f;
        let msbs1 = !b1 & !0x7f7f7f7f7f7f7f7f;

        // Candidate terminator positions (in bits) within each half.
        let len0 = msbs0.trailing_zeros() + 1;
        let len1 = msbs1.trailing_zeros() + 1;

        // Mask each half down to the bytes belonging to this varint. The
        // multiply zeroes the high half entirely unless the varint actually
        // extends past byte 8 (`msbs0 == 0` means no terminator in the low
        // half).
        let varint_part0 = b0 & (msbs0 ^ msbs0.wrapping_sub(1));
        let varint_part1 = (b1 & (msbs1 ^ msbs1.wrapping_sub(1))) * ((msbs0 == 0) as u64);

        // Reinterpret the two masked words as a 16-byte vector and let the
        // target type extract and merge its 7-bit groups.
        let num = T::vector_to_num(core::mem::transmute::<[u64; 2], [u8; 16]>([
            varint_part0,
            varint_part1,
        ]));
        // Total length in bytes, accounting for a terminator in either half.
        let len = if msbs0 == 0 { len1 + 64 } else { len0 } / 8;

        (num, len as usize)
    }
}
204
/// Decodes two consecutive varints starting at `bytes` with SSSE3
/// intrinsics, returning both values and their encoded lengths in bytes.
///
/// Panics at the top if the combined maximum encoded length of `T` and `U`
/// could exceed the 16 bytes loaded.
///
/// # Safety
/// At least 16 bytes must be readable starting at `bytes`, regardless of the
/// actual length of the encoded values.
#[inline]
#[cfg(any(target_feature = "ssse3", doc))]
#[cfg_attr(rustc_nightly, doc(cfg(target_feature = "ssse3")))]
pub unsafe fn decode_two_unsafe<T: VarIntTarget, U: VarIntTarget>(
    bytes: *const u8,
) -> (T, U, u8, u8) {
    if T::MAX_VARINT_BYTES + U::MAX_VARINT_BYTES > 16 {
        panic!(
            "exceeded length limit: cannot decode {} and {}, total length {} exceeds 16 bytes",
            core::any::type_name::<T>(),
            core::any::type_name::<U>(),
            T::MAX_VARINT_BYTES + U::MAX_VARINT_BYTES
        );
    }

    // Both targets encode in at most 5 bytes: use the lookup-table fast path.
    if T::MAX_VARINT_BYTES <= 5 && U::MAX_VARINT_BYTES <= 5 {
        return decode_two_u32_unsafe(bytes);
    }

    let b = _mm_loadu_si128(bytes as *const __m128i);

    // One bit per byte: the continuation (MSB) bit of each input byte.
    let bitmask = _mm_movemask_epi8(b) as u32;

    // In the inverted mask, set bits mark terminator bytes; the position of
    // the first (and, after shifting it out, the second) gives each length.
    let bm_not = !bitmask;
    let first_len = bm_not.trailing_zeros() + 1;
    let bm_not_2 = bm_not >> first_len;
    let second_len = bm_not_2.trailing_zeros() + 1;

    let ascend = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

    // First varint: keep only bytes at positions < first_len.
    let first_len_vec = _mm_set1_epi8(first_len as i8);
    let first_mask = _mm_cmplt_epi8(ascend, first_len_vec);
    let first = _mm_and_si128(b, first_mask);

    // Second varint: shuffle it down so it starts at byte 0, then mask to its
    // own length.
    let second_shuf = _mm_add_epi8(ascend, first_len_vec);
    let second_shuffled = _mm_shuffle_epi8(b, second_shuf);
    let second_mask = _mm_cmplt_epi8(ascend, _mm_set1_epi8(second_len as i8));
    let second = _mm_and_si128(second_shuffled, second_mask);

    let first_num;
    let second_num;

    // Prefer the vectorized "stage 2" extraction when both values fit in one
    // 64-bit lane each, unless the build is configured to assume PDEP-based
    // extraction (`vector_to_num`) is faster on this CPU.
    let should_turbo = T::MAX_VARINT_BYTES <= 8
        && U::MAX_VARINT_BYTES <= 8
        && cfg!(not(all(target_feature = "bmi2", very_fast_pdep)));
    if should_turbo {
        // Pack the two aligned varints into the low/high 64-bit lanes.
        let comb = _mm_or_si128(first, _mm_bslli_si128(second, 8));

        // Pick the narrowest stage-2 combiner both types can use.
        let x = if T::MAX_VARINT_BYTES <= 2 && U::MAX_VARINT_BYTES <= 2 {
            dual_u8_stage2(comb)
        } else if T::MAX_VARINT_BYTES <= 3 && U::MAX_VARINT_BYTES <= 3 {
            dual_u16_stage2(comb)
        } else {
            dual_u32_stage2(comb)
        };

        // Results land in 32-bit lanes 0 and 2 (the low dword of each 64-bit
        // lane).
        let x: [u32; 4] = core::mem::transmute(x);
        first_num = T::cast_u32(x[0]);
        second_num = U::cast_u32(x[2]);
    } else {
        first_num = T::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(first));
        second_num = U::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(second));
    }

    (first_num, second_num, first_len as u8, second_len as u8)
}
293
/// Fast path of [`decode_two_unsafe`] for two targets that each encode in at
/// most 5 bytes: a precomputed lookup table replaces the length scan and
/// shuffle construction.
///
/// # Safety
/// At least 16 bytes must be readable starting at `bytes`.
#[inline]
#[cfg(any(target_feature = "ssse3", doc))]
unsafe fn decode_two_u32_unsafe<T: VarIntTarget, U: VarIntTarget>(
    bytes: *const u8,
) -> (T, U, u8, u8) {
    let b = _mm_loadu_si128(bytes as *const __m128i);

    // Continuation bits of the first 10 bytes — the most two <=5-byte
    // varints can span.
    let bitmask = _mm_movemask_epi8(b) as u32 & 0b1111111111;

    // Step-1 table maps the 10-bit continuation pattern straight to both
    // lengths plus an index into the shuffle-vector table.
    let (lookup, first_len, second_len) =
        *lookup::LOOKUP_DOUBLE_STEP1.get_unchecked(bitmask as usize);
    let shuf = *lookup::LOOKUP_DOUBLE_VEC.get_unchecked(lookup as usize);

    // One shuffle aligns the first varint to lane 0 and the second to lane 1
    // (64-bit lanes).
    let comb = _mm_shuffle_epi8(b, shuf);

    let first_num;
    let second_num;

    // Vectorized stage-2 extraction unless PDEP-based `vector_to_num` is
    // configured as faster.
    let should_turbo = cfg!(not(all(target_feature = "bmi2", very_fast_pdep)));
    if should_turbo {
        // Pick the narrowest stage-2 combiner both types can use.
        let x = if T::MAX_VARINT_BYTES <= 2 && U::MAX_VARINT_BYTES <= 2 {
            dual_u8_stage2(comb)
        } else if T::MAX_VARINT_BYTES <= 3 && U::MAX_VARINT_BYTES <= 3 {
            dual_u16_stage2(comb)
        } else {
            dual_u32_stage2(comb)
        };

        // Results land in 32-bit lanes 0 and 2.
        let x: [u32; 4] = core::mem::transmute(x);
        first_num = T::cast_u32(x[0]);
        second_num = U::cast_u32(x[2]);
    } else {
        first_num = T::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(comb));
        second_num = U::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(_mm_bsrli_si128(
            comb, 8,
        )));
    }

    (first_num, second_num, first_len, second_len)
}
340
/// Stage-2 combiner for two u8-sized varints, one per 64-bit lane of `comb`
/// (each lane holds up to 2 raw encoded bytes at its low end).
///
/// Per lane: keep the low 7 bits of byte 0 and fold bit 0 of byte 1 (bit 8,
/// shifted down by 1) into bit 7, yielding the decoded value in the lane's
/// low byte.
#[inline(always)]
unsafe fn dual_u8_stage2(comb: __m128i) -> __m128i {
    _mm_or_si128(
        _mm_and_si128(comb, _mm_set_epi64x(0x000000000000007f, 0x000000000000007f)),
        _mm_srli_epi64(
            _mm_and_si128(comb, _mm_set_epi64x(0x0000000000000100, 0x0000000000000100)),
            1,
        ),
    )
}
351
/// Stage-2 combiner for two u16-sized varints, one per 64-bit lane of `comb`
/// (each lane holds up to 3 raw encoded bytes at its low end).
///
/// Per lane, the 7-bit groups are compacted by masking each encoded byte and
/// shifting out the continuation-bit gaps: byte 0 contributes bits 0..7,
/// byte 1 (>> 1) bits 7..14, and the low 2 bits of byte 2 (>> 2) bits 14..16.
#[inline(always)]
unsafe fn dual_u16_stage2(comb: __m128i) -> __m128i {
    _mm_or_si128(
        _mm_or_si128(
            _mm_and_si128(comb, _mm_set_epi64x(0x000000000000007f, 0x000000000000007f)),
            _mm_srli_epi64(
                _mm_and_si128(comb, _mm_set_epi64x(0x0000000000030000, 0x0000000000030000)),
                2,
            ),
        ),
        _mm_srli_epi64(
            _mm_and_si128(comb, _mm_set_epi64x(0x0000000000007f00, 0x0000000000007f00)),
            1,
        ),
    )
}
368
/// Stage-2 combiner for two u32-sized varints, one per 64-bit lane of `comb`
/// (each lane holds up to 5 raw encoded bytes at its low end).
///
/// Per lane, each encoded byte k is masked to its payload bits and shifted
/// right by k to close the continuation-bit gaps; byte 4 contributes only its
/// low 4 bits (mask 0x0f), completing the 32-bit value.
#[inline(always)]
unsafe fn dual_u32_stage2(comb: __m128i) -> __m128i {
    _mm_or_si128(
        _mm_or_si128(
            _mm_and_si128(comb, _mm_set_epi64x(0x000000000000007f, 0x000000000000007f)),
            _mm_srli_epi64(
                _mm_and_si128(comb, _mm_set_epi64x(0x0000000f00000000, 0x0000000f00000000)),
                4,
            ),
        ),
        _mm_or_si128(
            _mm_or_si128(
                _mm_srli_epi64(
                    _mm_and_si128(comb, _mm_set_epi64x(0x000000007f000000, 0x000000007f000000)),
                    3,
                ),
                _mm_srli_epi64(
                    _mm_and_si128(comb, _mm_set_epi64x(0x00000000007f0000, 0x00000000007f0000)),
                    2,
                ),
            ),
            _mm_srli_epi64(
                _mm_and_si128(comb, _mm_set_epi64x(0x0000000000007f00, 0x0000000000007f00)),
                1,
            ),
        ),
    )
}
397
/// Decodes two consecutive wide (up to 10-byte) varints starting at `bytes`
/// with AVX2 intrinsics, returning both values and their encoded lengths.
///
/// # Safety
/// At least 32 bytes must be readable starting at `bytes`, regardless of the
/// actual length of the encoded values.
#[inline]
#[cfg(any(target_feature = "avx2", doc))]
#[cfg_attr(rustc_nightly, doc(cfg(target_feature = "avx2")))]
pub unsafe fn decode_two_wide_unsafe<T: VarIntTarget, U: VarIntTarget>(
    bytes: *const u8,
) -> (T, U, u8, u8) {
    let b = _mm256_loadu_si256(bytes as *const __m256i);

    // One continuation (MSB) bit per input byte, 32 bytes total.
    let bitmask = _mm256_movemask_epi8(b) as u32;

    // Set bits in the inverted mask mark terminator bytes; the first two give
    // the two lengths.
    let bm_not = !bitmask;
    let first_len = bm_not.trailing_zeros() + 1;
    let bm_not_2 = bm_not >> first_len;
    let second_len = bm_not_2.trailing_zeros() + 1;

    // First varint lives entirely in the low 128-bit lane (max 10 bytes):
    // mask it to its length.
    let ascend = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    let first_mask = _mm_cmplt_epi8(ascend, _mm_set1_epi8(first_len as i8));
    let first = _mm_and_si128(_mm256_extracti128_si256(b, 0), first_mask);

    let shuf_gen = _mm256_setr_epi8(
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
        12, 13, 14, 15,
    );

    // `vpshufb` cannot move bytes across 128-bit lanes, so the second varint
    // (which may straddle the lane boundary) is gathered in two pieces: the
    // low lane is rotated left by `first_len`, the high lane right by
    // `16 - first_len`.
    let shuf_add = _mm256_set_m128i(
        _mm_set1_epi8(-(16i8 - first_len as i8)),
        _mm_set1_epi8(first_len as i8),
    );
    let shuf_added = _mm256_add_epi8(shuf_gen, shuf_add);
    // Out-of-range indices are forced negative so the shuffle writes zeros
    // there instead of wrapping.
    let shuf = _mm256_or_si256(
        shuf_added,
        _mm256_cmpgt_epi8(shuf_added, _mm256_set1_epi8(15)),
    );
    let shuffled = _mm256_shuffle_epi8(b, shuf);

    // The two partial gathers are disjoint (zeros elsewhere), so OR-ing the
    // lanes reassembles the second varint aligned to byte 0.
    let second_shifted = _mm_or_si128(
        _mm256_extracti128_si256(shuffled, 0),
        _mm256_extracti128_si256(shuffled, 1),
    );
    let second_mask = _mm_cmplt_epi8(ascend, _mm_set1_epi8(second_len as i8));
    let second = _mm_and_si128(second_shifted, second_mask);

    let first_num;
    let second_num;

    // NOTE(review): unlike the SSSE3 variants, the vectorized stage-2 path is
    // unconditional here — there is no bmi2/very_fast_pdep opt-out.
    let should_turbo = true;
    if should_turbo {
        // Interleave so lane 0 holds bytes 0..8 of the first varint and lane
        // 1 bytes 0..8 of the second; stage 2 compacts the 7-bit groups of
        // those low 8 encoded bytes (byte k masked and shifted right by k).
        let comb_lo = _mm_unpacklo_epi64(first, second);
        let x_lo = _mm_or_si128(
            _mm_or_si128(
                _mm_or_si128(
                    _mm_and_si128(comb_lo, _mm_set1_epi64x(0x000000000000007f)),
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x7f00000000000000)),
                        7,
                    ),
                ),
                _mm_or_si128(
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x007f000000000000)),
                        6,
                    ),
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x00007f0000000000)),
                        5,
                    ),
                ),
            ),
            _mm_or_si128(
                _mm_or_si128(
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x0000007f00000000)),
                        4,
                    ),
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x000000007f000000)),
                        3,
                    ),
                ),
                _mm_or_si128(
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x00000000007f0000)),
                        2,
                    ),
                    _mm_srli_epi64(
                        _mm_and_si128(comb_lo, _mm_set1_epi64x(0x0000000000007f00)),
                        1,
                    ),
                ),
            ),
        );

        // Encoded bytes 8 and 9 carry the top 8 bits of a 10-byte varint:
        // byte 8's 7 payload bits go to bits 56..63 (<< 56) and byte 9's low
        // bit to bit 63 (bit 8 of the high half << 55).
        let comb_hi = _mm_unpackhi_epi64(first, second);
        let x_hi = _mm_or_si128(
            _mm_slli_epi64(
                _mm_and_si128(comb_hi, _mm_set1_epi64x(0x0000000000000100)),
                55,
            ),
            _mm_slli_epi64(
                _mm_and_si128(comb_hi, _mm_set1_epi64x(0x000000000000007f)),
                56,
            ),
        );

        let x = _mm_or_si128(x_lo, x_hi);

        first_num = T::cast_u64(_mm_extract_epi64(x, 0) as u64);
        second_num = U::cast_u64(_mm_extract_epi64(x, 1) as u64);
    } else {
        first_num = T::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(first));
        second_num = U::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(second));
    }

    (first_num, second_num, first_len as u8, second_len as u8)
}
532
/// Decodes four consecutive varints starting at `bytes` with SSSE3
/// intrinsics, returning the four values, their encoded lengths, and an
/// "invalid" flag (always `false` on this general path; only the u16 fast
/// path can report `true`).
///
/// Panics at the top if the combined maximum encoded length of the four
/// types could exceed the 16 bytes loaded.
///
/// # Safety
/// At least 16 bytes must be readable starting at `bytes`, regardless of the
/// actual length of the encoded values.
#[inline]
#[cfg(any(target_feature = "ssse3", doc))]
#[cfg_attr(rustc_nightly, doc(cfg(target_feature = "ssse3")))]
pub unsafe fn decode_four_unsafe<
    T: VarIntTarget,
    U: VarIntTarget,
    V: VarIntTarget,
    W: VarIntTarget,
>(
    bytes: *const u8,
) -> (T, U, V, W, u8, u8, u8, u8, bool) {
    if T::MAX_VARINT_BYTES + U::MAX_VARINT_BYTES + V::MAX_VARINT_BYTES + W::MAX_VARINT_BYTES > 16 {
        panic!(
            "exceeded length limit: cannot decode {}, {}, {}, and {}, total length {} exceeds 16 bytes",
            core::any::type_name::<T>(),
            core::any::type_name::<U>(),
            core::any::type_name::<V>(),
            core::any::type_name::<W>(),
            T::MAX_VARINT_BYTES + U::MAX_VARINT_BYTES + V::MAX_VARINT_BYTES + W::MAX_VARINT_BYTES
        );
    }

    // All four targets encode in at most 3 bytes: use the lookup-table fast
    // path.
    if T::MAX_VARINT_BYTES <= 3
        && U::MAX_VARINT_BYTES <= 3
        && V::MAX_VARINT_BYTES <= 3
        && W::MAX_VARINT_BYTES <= 3
    {
        return decode_four_u16_unsafe(bytes);
    }

    let b = _mm_loadu_si128(bytes as *const __m128i);

    // One continuation (MSB) bit per byte.
    let bitmask = _mm_movemask_epi8(b) as u32;

    // In the inverted mask, set bits mark terminator bytes; shifting each
    // found terminator out yields the four lengths in sequence.
    let bm_not = !bitmask;
    let first_len = bm_not.trailing_zeros() + 1;
    let bm_not_2 = bm_not >> first_len;
    let second_len = bm_not_2.trailing_zeros() + 1;
    let bm_not_3 = bm_not_2 >> second_len;
    let third_len = bm_not_3.trailing_zeros() + 1;
    let bm_not_4 = bm_not_3 >> third_len;
    let fourth_len = bm_not_4.trailing_zeros() + 1;

    let ascend = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

    // First varint: keep bytes at positions < first_len.
    let first_len_vec = _mm_set1_epi8(first_len as i8);
    let first_mask = _mm_cmplt_epi8(ascend, first_len_vec);
    let first = _mm_and_si128(b, first_mask);

    // Each subsequent varint is shuffled down to byte 0 (by re-shuffling the
    // previous shuffle result) and then masked to its own length.
    let second_shuf = _mm_add_epi8(ascend, first_len_vec);
    let second_shuffled = _mm_shuffle_epi8(b, second_shuf);
    let second_len_vec = _mm_set1_epi8(second_len as i8);
    let second_mask = _mm_cmplt_epi8(ascend, second_len_vec);
    let second = _mm_and_si128(second_shuffled, second_mask);

    let third_shuf = _mm_add_epi8(ascend, second_len_vec);
    let third_shuffled = _mm_shuffle_epi8(second_shuffled, third_shuf);
    let third_len_vec = _mm_set1_epi8(third_len as i8);
    let third_mask = _mm_cmplt_epi8(ascend, third_len_vec);
    let third = _mm_and_si128(third_shuffled, third_mask);

    let fourth_shuf = _mm_add_epi8(ascend, third_len_vec);
    let fourth_shuffled = _mm_shuffle_epi8(third_shuffled, fourth_shuf);
    let fourth_len_vec = _mm_set1_epi8(fourth_len as i8);
    let fourth_mask = _mm_cmplt_epi8(ascend, fourth_len_vec);
    let fourth = _mm_and_si128(fourth_shuffled, fourth_mask);

    let first_num;
    let second_num;
    let third_num;
    let fourth_num;

    // Vectorized stage-2 extraction when every value fits in a 32-bit lane,
    // unless PDEP-based `vector_to_num` is configured as faster.
    let should_turbo = T::MAX_VARINT_BYTES <= 4
        && U::MAX_VARINT_BYTES <= 4
        && V::MAX_VARINT_BYTES <= 4
        && W::MAX_VARINT_BYTES <= 4
        && cfg!(not(all(
            target_feature = "bmi2",
            very_fast_pdep
        )));
    if should_turbo {
        // Pack the four aligned varints into the four 32-bit lanes.
        let comb = _mm_or_si128(
            _mm_or_si128(first, _mm_bslli_si128(second, 4)),
            _mm_or_si128(_mm_bslli_si128(third, 8), _mm_bslli_si128(fourth, 12)),
        );

        // Compact the 7-bit groups per 32-bit lane: byte k is masked to its
        // payload bits and shifted right by k (byte 2 contributes only 2
        // bits for <=3-byte encodings).
        let x = if T::MAX_VARINT_BYTES <= 2
            && U::MAX_VARINT_BYTES <= 2
            && V::MAX_VARINT_BYTES <= 2
            && W::MAX_VARINT_BYTES <= 2
        {
            _mm_or_si128(
                _mm_and_si128(comb, _mm_set1_epi32(0x0000007f)),
                _mm_srli_epi32(_mm_and_si128(comb, _mm_set1_epi32(0x00000100)), 1),
            )
        } else {
            _mm_or_si128(
                _mm_or_si128(
                    _mm_and_si128(comb, _mm_set1_epi32(0x0000007f)),
                    _mm_srli_epi32(_mm_and_si128(comb, _mm_set1_epi32(0x00030000)), 2),
                ),
                _mm_srli_epi32(_mm_and_si128(comb, _mm_set1_epi32(0x00007f00)), 1),
            )
        };

        // One decoded value per 32-bit lane.
        let x: [u32; 4] = core::mem::transmute(x);
        first_num = T::cast_u32(x[0]);
        second_num = U::cast_u32(x[1]);
        third_num = V::cast_u32(x[2]);
        fourth_num = W::cast_u32(x[3]);
    } else {
        first_num = T::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(first));
        second_num = U::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(second));
        third_num = V::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(third));
        fourth_num = W::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(fourth));
    }

    (
        first_num,
        second_num,
        third_num,
        fourth_num,
        first_len as u8,
        second_len as u8,
        third_len as u8,
        fourth_len as u8,
        // This general path never flags invalid input.
        false,
    )
}
682
/// Fast path of [`decode_four_unsafe`] for four targets that each encode in
/// at most 3 bytes: a precomputed lookup table supplies the lengths, the
/// shuffle vector, and a validity flag in one step.
///
/// # Safety
/// At least 16 bytes must be readable starting at `bytes`.
#[inline]
#[cfg(any(target_feature = "ssse3", doc))]
#[cfg_attr(rustc_nightly, doc(cfg(target_feature = "ssse3")))]
unsafe fn decode_four_u16_unsafe<
    T: VarIntTarget,
    U: VarIntTarget,
    V: VarIntTarget,
    W: VarIntTarget,
>(
    bytes: *const u8,
) -> (T, U, V, W, u8, u8, u8, u8, bool) {
    let b = _mm_loadu_si128(bytes as *const __m128i);

    // One continuation (MSB) bit per byte.
    let bitmask = _mm_movemask_epi8(b) as u32;

    // The low 12 continuation bits (the most four <=3-byte varints can span)
    // index a table that packs everything needed for this pattern.
    let lookup = *lookup::LOOKUP_QUAD_STEP1.get_unchecked((bitmask & 0b111111111111) as usize);

    // Low 8 bits: index into the shuffle-vector table.
    let shuf = *lookup::LOOKUP_QUAD_VEC.get_unchecked((lookup & 0b11111111) as usize);

    // Bits 8..24: the four encoded lengths, 4 bits each.
    let first_len = (lookup >> 8) & 0b1111;
    let second_len = (lookup >> 12) & 0b1111;
    let third_len = (lookup >> 16) & 0b1111;
    let fourth_len = (lookup >> 20) & 0b1111;

    // One shuffle aligns each varint to its own 32-bit lane.
    let comb = _mm_shuffle_epi8(b, shuf);

    // Bit 31 flags a continuation-bit pattern the table deems invalid.
    let invalid = lookup >> 31;

    let first_num;
    let second_num;
    let third_num;
    let fourth_num;

    // Vectorized stage-2 extraction unless PDEP-based `vector_to_num` is
    // configured as faster.
    let should_turbo = cfg!(not(all(target_feature = "bmi2", very_fast_pdep)));
    if should_turbo {
        // Compact the 7-bit groups per 32-bit lane: byte k is masked to its
        // payload bits and shifted right by k (byte 2 contributes only 2
        // bits for <=3-byte encodings).
        let x = if T::MAX_VARINT_BYTES <= 2
            && U::MAX_VARINT_BYTES <= 2
            && V::MAX_VARINT_BYTES <= 2
            && W::MAX_VARINT_BYTES <= 2
        {
            _mm_or_si128(
                _mm_and_si128(comb, _mm_set1_epi32(0x0000007f)),
                _mm_srli_epi32(_mm_and_si128(comb, _mm_set1_epi32(0x00000100)), 1),
            )
        } else {
            _mm_or_si128(
                _mm_or_si128(
                    _mm_and_si128(comb, _mm_set1_epi32(0x0000007f)),
                    _mm_srli_epi32(_mm_and_si128(comb, _mm_set1_epi32(0x00030000)), 2),
                ),
                _mm_srli_epi32(_mm_and_si128(comb, _mm_set1_epi32(0x00007f00)), 1),
            )
        };

        // One decoded value per 32-bit lane.
        let x: [u32; 4] = core::mem::transmute(x);
        first_num = T::cast_u32(x[0]);
        second_num = U::cast_u32(x[1]);
        third_num = V::cast_u32(x[2]);
        fourth_num = W::cast_u32(x[3]);
    } else {
        first_num = T::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(comb));
        second_num = U::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(_mm_bsrli_si128(
            comb, 4,
        )));
        third_num = V::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(_mm_bsrli_si128(
            comb, 8,
        )));
        fourth_num = W::vector_to_num(core::mem::transmute::<__m128i, [u8; 16]>(_mm_bsrli_si128(
            comb, 12,
        )));
    }

    (
        first_num,
        second_num,
        third_num,
        fourth_num,
        first_len as u8,
        second_len as u8,
        third_len as u8,
        fourth_len as u8,
        invalid != 0,
    )
}
775
/// Decodes eight consecutive u8-sized varints (1-2 encoded bytes each)
/// starting at `bytes`, returning the eight values and the total number of
/// input bytes they consumed.
///
/// # Safety
/// At least 16 bytes must be readable starting at `bytes`, regardless of the
/// actual length of the encoded values.
#[inline]
#[cfg(any(target_feature = "ssse3", doc))]
#[cfg_attr(rustc_nightly, doc(cfg(target_feature = "ssse3")))]
pub unsafe fn decode_eight_u8_unsafe(bytes: *const u8) -> ([u8; 8], u8) {
    let b = _mm_loadu_si128(bytes as *const __m128i);

    let ones = _mm_set1_epi8(1);
    // Per-varint flags: lens[i] will be 1 if varint i is 2 bytes, 0 if 1 byte.
    let mut lens = _mm_setzero_si128();
    // `shift` starts as a per-byte continuation flag (1 where the input
    // byte's MSB is set, i.e. the byte is negative as i8); each step below
    // consumes one varint from its front.
    let mut shift = _mm_and_si128(_mm_cmplt_epi8(b, _mm_setzero_si128()), ones);
    let ascend = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    let asc_one = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    // Single-byte window that selects which lane of `lens` each step writes.
    let mut window_small = _mm_setr_epi8(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

    let broadcast_mask = _mm_setzero_si128();

    // Unrolled step 1 of 8: broadcast the front flag (this varint's
    // "2-byte?" bit), record it in lens[0], and shift `shift` forward by
    // 1 + flag bytes to skip the varint just consumed.
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);

    // Unrolled step 2 of 8.
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);

    // Unrolled step 3 of 8.
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);

    // Unrolled step 4 of 8.
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);

    // Unrolled step 5 of 8.
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);

    // Unrolled step 6 of 8.
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);

    // Unrolled step 7 of 8.
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    shift = _mm_shuffle_epi8(shift, _mm_add_epi8(asc_one, first_byte));
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));
    window_small = _mm_bslli_si128(window_small, 1);

    // Unrolled step 8 of 8: no further shift needed after the last flag.
    let first_byte = _mm_shuffle_epi8(shift, broadcast_mask);
    lens = _mm_or_si128(lens, _mm_and_si128(first_byte, window_small));

    // lens_invert[i] = 1 for a 1-byte varint, 0 for 2-byte; the log-step
    // shifted adds below turn it into a byte-wise prefix sum.
    let lens_invert = _mm_sub_epi8(ones, lens);
    let mut cumul_lens = _mm_add_epi8(lens_invert, _mm_bslli_si128(lens_invert, 1));
    cumul_lens = _mm_add_epi8(cumul_lens, _mm_bslli_si128(cumul_lens, 2));
    cumul_lens = _mm_add_epi8(cumul_lens, _mm_bslli_si128(cumul_lens, 4));
    cumul_lens = _mm_add_epi8(cumul_lens, _mm_bslli_si128(cumul_lens, 8));

    // cumul_lens[7] counts 1-byte varints among the eight, so the total
    // bytes consumed is 2*8 - (number of 1-byte varints).
    let cumul_lens_2: [u8; 16] = core::mem::transmute(cumul_lens);
    let last_len = 8 - cumul_lens_2[7] + 8;

    // Expand each length flag (+1) into the high byte of a 16-bit slot, so
    // slot i holds this varint's byte count (1 or 2) at its odd byte.
    let second = _mm_shuffle_epi8(
        _mm_add_epi8(lens, ones),
        _mm_setr_epi8(-1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7),
    );

    // For 1-byte varints, force the second source index negative so the
    // gather below zeros that byte.
    let shuf_pt1 = _mm_or_si128(ascend, _mm_cmpeq_epi8(second, ones));

    // Per-slot correction from the prefix sums, duplicated to both bytes of
    // each 16-bit slot, converting "dense" positions into actual input
    // offsets.
    let x_shuf = _mm_shuffle_epi8(
        _mm_bslli_si128(cumul_lens, 1),
        _mm_setr_epi8(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7),
    );

    // Gather: one 16-bit slot per varint, containing its 1-2 encoded bytes.
    let shuf = _mm_sub_epi8(shuf_pt1, x_shuf);
    let comb = _mm_shuffle_epi8(b, shuf);

    // Per 16-bit slot, keep the low 7 bits of byte 0 and fold bit 0 of byte
    // 1 (bit 8 >> 1) into bit 7, producing each decoded u8.
    let x = _mm_or_si128(
        _mm_and_si128(comb, _mm_set1_epi16(0x0000007f)),
        _mm_srli_epi16(_mm_and_si128(comb, _mm_set1_epi16(0x00000100)), 1),
    );

    // Pack the low byte of each 16-bit slot into the first 8 bytes.
    let shuf = _mm_shuffle_epi8(
        x,
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1),
    );
    let lower: [u64; 2] = core::mem::transmute(shuf);
    let nums = lower[0].to_ne_bytes();

    (nums, last_len)
}