use std::arch::x86_64::*;
use std::mem;
use packed::pattern::{PatternID, Patterns};
use packed::teddy::compile;
use packed::vector::*;
use Match;
/// Runtime state of a Teddy searcher: pattern ids grouped into buckets plus
/// the concrete executor that scans the haystack.
#[derive(Clone, Debug)]
pub struct Teddy {
    /// Pattern ids grouped by bucket; indexed by bucket number in
    /// `verify_bucket`. The executors in this file assert 8 or 16 buckets.
    pub buckets: Vec<Vec<PatternID>>,
    /// Compared against `Patterns::max_pattern_id()` on every `find_at`.
    pub max_pattern_id: PatternID,
    /// The executor variant that `find_at` dispatches to.
    pub exec: Exec,
}

impl Teddy {
    /// Searches `haystack` starting at offset `at` and returns the first
    /// match reported by the selected executor, if any.
    ///
    /// # Panics
    ///
    /// Panics if `pats.max_pattern_id()` differs from `self.max_pattern_id`,
    /// if `at > haystack.len()`, or if fewer than `self.minimum_len()` bytes
    /// remain in `haystack[at..]`.
    pub fn find_at(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
    ) -> Option<Match> {
        assert_eq!(
            self.max_pattern_id,
            pats.max_pattern_id(),
            "teddy must be called with same patterns it was built with",
        );
        assert!(haystack[at..].len() >= self.minimum_len());
        // SAFETY: the executors are `#[target_feature]` functions, so the
        // CPU must support SSSE3 / AVX2 respectively.
        // NOTE(review): presumably guaranteed by whoever constructs `Exec`;
        // that code is not visible here -- confirm.
        match self.exec {
            Exec::TeddySlim1Mask128(ref exec) => unsafe {
                exec.find_at(pats, self, haystack, at)
            },
            Exec::TeddySlim1Mask256(ref exec) => unsafe {
                exec.find_at(pats, self, haystack, at)
            },
            Exec::TeddyFat1Mask256(ref exec) => unsafe {
                exec.find_at(pats, self, haystack, at)
            },
            Exec::TeddySlim2Mask128(ref exec) => unsafe {
                exec.find_at(pats, self, haystack, at)
            },
            Exec::TeddySlim2Mask256(ref exec) => unsafe {
                exec.find_at(pats, self, haystack, at)
            },
            Exec::TeddyFat2Mask256(ref exec) => unsafe {
                exec.find_at(pats, self, haystack, at)
            },
            Exec::TeddySlim3Mask128(ref exec) => unsafe {
                exec.find_at(pats, self, haystack, at)
            },
            Exec::TeddySlim3Mask256(ref exec) => unsafe {
                exec.find_at(pats, self, haystack, at)
            },
            Exec::TeddyFat3Mask256(ref exec) => unsafe {
                exec.find_at(pats, self, haystack, at)
            },
        }
    }

    /// Returns the smallest number of haystack bytes (counted from the
    /// search start) that `find_at` accepts for the selected executor.
    ///
    /// NOTE(review): `TeddyFat3Mask256` reports 34 even though its loop in
    /// this file consumes 16-byte windows after skipping 2 bytes; confirm
    /// whether that is intentional before relying on it.
    pub fn minimum_len(&self) -> usize {
        match self.exec {
            Exec::TeddySlim1Mask128(_) | Exec::TeddyFat1Mask256(_) => 16,
            Exec::TeddySlim2Mask128(_) | Exec::TeddyFat2Mask256(_) => 17,
            Exec::TeddySlim3Mask128(_) => 18,
            Exec::TeddySlim1Mask256(_) => 32,
            Exec::TeddySlim2Mask256(_) => 33,
            Exec::TeddySlim3Mask256(_) | Exec::TeddyFat3Mask256(_) => 34,
        }
    }

    /// Returns an estimate of the heap memory used by the buckets: one
    /// `Vec` header per bucket plus one `PatternID` per pattern id in
    /// `0..=max_pattern_id`.
    pub fn heap_bytes(&self) -> usize {
        let id_count = self.max_pattern_id as usize + 1;
        let header_bytes =
            self.buckets.len() * mem::size_of::<Vec<PatternID>>();
        let id_bytes = id_count * mem::size_of::<PatternID>();
        header_bytes + id_bytes
    }

    /// Verifies a non-zero 128-bit candidate produced for the window that
    /// starts at `at`, one 64-bit word (8 haystack positions) at a time.
    ///
    /// # Safety
    ///
    /// Calls vector helpers from `packed::vector`; presumably requires the
    /// same CPU features as the executor that produced `cand`.
    #[inline(always)]
    unsafe fn verify128(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
        cand: __m128i,
    ) -> Option<Match> {
        debug_assert!(!is_all_zeroes128(cand));
        debug_assert_eq!(8, self.buckets.len());

        let words = unpack64x128(cand);
        for (idx, &word) in words.iter().enumerate() {
            // Each word carries eight 8-bit bucket sets, i.e. 8 positions.
            let found = self.verify64(pats, 8, haystack, at + idx * 8, word);
            if found.is_some() {
                return found;
            }
        }
        None
    }

    /// Verifies a non-zero 256-bit candidate produced for the window that
    /// starts at `at`, one 64-bit word (8 haystack positions) at a time.
    ///
    /// # Safety
    ///
    /// Calls vector helpers from `packed::vector`; presumably requires the
    /// same CPU features as the executor that produced `cand`.
    #[inline(always)]
    unsafe fn verify256(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
        cand: __m256i,
    ) -> Option<Match> {
        debug_assert!(!is_all_zeroes256(cand));
        debug_assert_eq!(8, self.buckets.len());

        let words = unpack64x256(cand);
        for (idx, &word) in words.iter().enumerate() {
            let found = self.verify64(pats, 8, haystack, at + idx * 8, word);
            if found.is_some() {
                return found;
            }
        }
        None
    }

    /// Verifies a non-zero 256-bit candidate from a 16-bucket ("fat")
    /// executor for the window that starts at `at`.
    ///
    /// The two 128-bit lanes of `cand` are byte-interleaved so that every
    /// haystack position owns one contiguous 16-bit bucket set; each 64-bit
    /// word then covers 4 positions.
    ///
    /// # Safety
    ///
    /// Uses AVX2 intrinsics; the CPU must support AVX2.
    #[inline(always)]
    unsafe fn verify_fat256(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        at: usize,
        cand: __m256i,
    ) -> Option<Match> {
        debug_assert!(!is_all_zeroes256(cand));
        debug_assert_eq!(16, self.buckets.len());

        // 0x4E selects 64-bit elements (2, 3, 0, 1): swap the 128-bit halves.
        let swapped = _mm256_permute4x64_epi64(cand, 0x4E);
        // Interleave bytes of `cand` with bytes of its swapped copy.
        let interleaved_lo = _mm256_unpacklo_epi8(cand, swapped);
        let interleaved_hi = _mm256_unpackhi_epi8(cand, swapped);
        let words = unpacklo64x256(interleaved_lo, interleaved_hi);
        for (idx, &word) in words.iter().enumerate() {
            let found = self.verify64(pats, 16, haystack, at + idx * 4, word);
            if found.is_some() {
                return found;
            }
        }
        None
    }

    /// Walks the set bits of `cand` from least to most significant. Bit `b`
    /// denotes bucket `b % bucket_count` at haystack offset
    /// `at + b / bucket_count`. Returns the first verified match.
    #[inline(always)]
    fn verify64(
        &self,
        pats: &Patterns,
        bucket_count: usize,
        haystack: &[u8],
        at: usize,
        mut cand: u64,
    ) -> Option<Match> {
        debug_assert!(bucket_count == 8 || bucket_count == 16);
        while cand != 0 {
            let bit = cand.trailing_zeros() as usize;
            // Clear the lowest set bit (the one just read).
            cand &= cand - 1;

            let start = at + (bit / bucket_count);
            let bucket = bit % bucket_count;
            let found = self.verify_bucket(pats, haystack, bucket, start);
            if found.is_some() {
                return found;
            }
        }
        None
    }

    /// Tests every pattern in `bucket` as a prefix of `haystack[at..]` and
    /// returns a match for the first one that fits.
    #[inline(always)]
    fn verify_bucket(
        &self,
        pats: &Patterns,
        haystack: &[u8],
        bucket: usize,
        at: usize,
    ) -> Option<Match> {
        // Deliberately kept out of line and marked cold, as in the original.
        #[cold]
        #[inline(never)]
        fn match_from_span(
            id: PatternID,
            from: usize,
            to: usize,
        ) -> Match {
            Match::from_span(id as usize, from, to)
        }

        for &id in self.buckets[bucket].iter() {
            // SAFETY: relies on every id stored in a bucket being a valid
            // index into `pats`.
            // NOTE(review): presumably ensured by the `max_pattern_id`
            // assertion in `find_at` together with how buckets are built --
            // confirm against the builder.
            let pat = unsafe { pats.get_unchecked(id) };
            if !pat.is_prefix(&haystack[at..]) {
                continue;
            }
            return Some(match_from_span(id, at, at + pat.len()));
        }
        None
    }
}
/// The concrete search routine used by a `Teddy` searcher.
///
/// Each variant wraps the executor struct of the same name defined in this
/// file. In those executors, "Slim" variants assert 8 buckets and "Fat"
/// variants assert 16; the number before `Mask` is how many masks the
/// executor holds, and the `128`/`256` suffix is the mask width in bits.
#[derive(Clone, Debug)]
pub enum Exec {
TeddySlim1Mask128(TeddySlim1Mask128),
TeddySlim1Mask256(TeddySlim1Mask256),
TeddyFat1Mask256(TeddyFat1Mask256),
TeddySlim2Mask128(TeddySlim2Mask128),
TeddySlim2Mask256(TeddySlim2Mask256),
TeddyFat2Mask256(TeddyFat2Mask256),
TeddySlim3Mask128(TeddySlim3Mask128),
TeddySlim3Mask256(TeddySlim3Mask256),
TeddyFat3Mask256(TeddyFat3Mask256),
}
/// Executor with one mask, 128-bit vectors and 8 buckets.
#[derive(Clone, Debug)]
pub struct TeddySlim1Mask128 {
    /// Nibble lookup tables applied to every haystack byte.
    pub mask1: Mask128,
}

impl TeddySlim1Mask128 {
    /// Scans `haystack` from `at` in 16-byte windows and returns the first
    /// verified match, if any.
    ///
    /// # Safety
    ///
    /// The CPU must support SSSE3, and `haystack[at..]` must hold at least
    /// `teddy.minimum_len()` bytes (only debug-asserted here).
    #[target_feature(enable = "ssse3")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        assert_eq!(8, teddy.buckets.len());

        let len = haystack.len();
        // Start offset of the final full 16-byte window.
        let last = len - 16;
        while at <= last {
            let found = self.probe(pats, teddy, haystack, at);
            if found.is_some() {
                return found;
            }
            at += 16;
        }
        if at >= len {
            return None;
        }
        // Bytes remain past the last aligned step: re-scan the final 16
        // bytes, overlapping the previous window.
        self.probe(pats, teddy, haystack, last)
    }

    /// Computes the candidate for the window at `at` and verifies it when it
    /// is non-zero.
    #[inline(always)]
    unsafe fn probe(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        at: usize,
    ) -> Option<Match> {
        let cand = self.candidate(haystack, at);
        if is_all_zeroes128(cand) {
            return None;
        }
        teddy.verify128(pats, haystack, at, cand)
    }

    /// Loads 16 bytes at `at` and returns the per-byte lookup result from
    /// `mask1`.
    #[inline(always)]
    unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m128i {
        debug_assert!(haystack[at..].len() >= 16);

        let window = loadu128(haystack, at);
        members1m128(window, self.mask1)
    }
}
/// Executor with one mask, 256-bit vectors and 8 buckets.
#[derive(Clone, Debug)]
pub struct TeddySlim1Mask256 {
    /// Nibble lookup tables applied to every haystack byte.
    pub mask1: Mask256,
}

impl TeddySlim1Mask256 {
    /// Scans `haystack` from `at` in 32-byte windows and returns the first
    /// verified match, if any.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2, and `haystack[at..]` must hold at least
    /// `teddy.minimum_len()` bytes (only debug-asserted here).
    #[target_feature(enable = "avx2")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        assert_eq!(8, teddy.buckets.len());

        let len = haystack.len();
        // Start offset of the final full 32-byte window.
        let last = len - 32;
        while at <= last {
            let found = self.probe(pats, teddy, haystack, at);
            if found.is_some() {
                return found;
            }
            at += 32;
        }
        if at >= len {
            return None;
        }
        // Bytes remain past the last aligned step: re-scan the final 32
        // bytes, overlapping the previous window.
        self.probe(pats, teddy, haystack, last)
    }

    /// Computes the candidate for the window at `at` and verifies it when it
    /// is non-zero.
    #[inline(always)]
    unsafe fn probe(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        at: usize,
    ) -> Option<Match> {
        let cand = self.candidate(haystack, at);
        if is_all_zeroes256(cand) {
            return None;
        }
        teddy.verify256(pats, haystack, at, cand)
    }

    /// Loads 32 bytes at `at` and returns the per-byte lookup result from
    /// `mask1`.
    #[inline(always)]
    unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
        debug_assert!(haystack[at..].len() >= 32);

        let window = loadu256(haystack, at);
        members1m256(window, self.mask1)
    }
}
/// Executor with one mask, 256-bit vectors and 16 buckets. Each step reads
/// 16 haystack bytes and duplicates them into both 128-bit lanes.
#[derive(Clone, Debug)]
pub struct TeddyFat1Mask256 {
    /// Nibble lookup tables applied to every haystack byte.
    pub mask1: Mask256,
}

impl TeddyFat1Mask256 {
    /// Scans `haystack` from `at` in 16-byte windows and returns the first
    /// verified match, if any.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2, and `haystack[at..]` must hold at least
    /// `teddy.minimum_len()` bytes (only debug-asserted here).
    #[target_feature(enable = "avx2")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        assert_eq!(16, teddy.buckets.len());

        let len = haystack.len();
        // Start offset of the final full 16-byte window.
        let last = len - 16;
        while at <= last {
            let found = self.probe(pats, teddy, haystack, at);
            if found.is_some() {
                return found;
            }
            at += 16;
        }
        if at >= len {
            return None;
        }
        // Bytes remain past the last aligned step: re-scan the final 16
        // bytes, overlapping the previous window.
        self.probe(pats, teddy, haystack, last)
    }

    /// Computes the candidate for the window at `at` and verifies it when it
    /// is non-zero.
    #[inline(always)]
    unsafe fn probe(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        at: usize,
    ) -> Option<Match> {
        let cand = self.candidate(haystack, at);
        if is_all_zeroes256(cand) {
            return None;
        }
        teddy.verify_fat256(pats, haystack, at, cand)
    }

    /// Loads 16 bytes at `at`, broadcasts them to both 128-bit lanes and
    /// returns the per-byte lookup result from `mask1`.
    #[inline(always)]
    unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
        debug_assert!(haystack[at..].len() >= 16);

        let half = loadu128(haystack, at);
        let window = _mm256_broadcastsi128_si256(half);
        members1m256(window, self.mask1)
    }
}
/// Executor with two masks, 128-bit vectors and 8 buckets.
#[derive(Clone, Debug)]
pub struct TeddySlim2Mask128 {
    /// Lookup tables tested against the first byte of a candidate position.
    pub mask1: Mask128,
    /// Lookup tables tested against the byte that follows it.
    pub mask2: Mask128,
}

impl TeddySlim2Mask128 {
    /// Scans `haystack` from `at` in 16-byte windows and returns the first
    /// verified match, if any.
    ///
    /// Windows are loaded starting one byte past `at`, because each
    /// candidate lane combines a byte with the byte before it. Candidates
    /// are therefore verified relative to `at - 1`.
    ///
    /// # Safety
    ///
    /// The CPU must support SSSE3, and `haystack[at..]` must hold at least
    /// `teddy.minimum_len()` bytes (only debug-asserted here).
    #[target_feature(enable = "ssse3")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        assert_eq!(8, teddy.buckets.len());

        at += 1;
        let len = haystack.len();
        // No previous window yet: treat the byte before the first window as
        // matching `mask1` for every bucket (verification filters it).
        let mut prev0 = ones128();
        while at <= len - 16 {
            let c = self.candidate(haystack, at, &mut prev0);
            if !is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) {
                    return Some(m);
                }
            }
            at += 16;
        }
        if at < len {
            // Re-scan the final 16 bytes. This window is not adjacent to the
            // previous one, so the carried state must be reset.
            at = len - 16;
            prev0 = ones128();
            let c = self.candidate(haystack, at, &mut prev0);
            if !is_all_zeroes128(c) {
                if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) {
                    return Some(m);
                }
            }
        }
        None
    }

    /// Computes the candidate vector for the 16 bytes at `at`.
    ///
    /// Lane `i` of the result is `mask1(byte[at + i - 1]) & mask2(byte[at + i])`.
    /// The `mask1` result for the byte before the window comes from `prev0`,
    /// which is updated with this window's `mask1` result on return so the
    /// next adjacent window sees the correct carry.
    ///
    /// Previously `prev0` was never written, so it stayed all-ones and lane
    /// 0 of every window was gated by `mask2` alone. That was still correct
    /// (verification rejects false positives) but caused needless
    /// verification work and disagreed with the other multi-mask executors.
    #[inline(always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m128i,
    ) -> __m128i {
        debug_assert!(haystack[at..].len() >= 16);

        let chunk = loadu128(haystack, at);
        let (res0, res1) = members2m128(chunk, self.mask1, self.mask2);
        // Shift `res0` up by one lane, pulling in the last lane of `prev0`.
        let res0prev0 = _mm_alignr_epi8(res0, *prev0, 15);
        let res = _mm_and_si128(res0prev0, res1);
        *prev0 = res0;
        res
    }
}
/// Executor with two masks, 256-bit vectors and 8 buckets.
#[derive(Clone, Debug)]
pub struct TeddySlim2Mask256 {
    /// Lookup tables tested against the first byte of a candidate position.
    pub mask1: Mask256,
    /// Lookup tables tested against the byte that follows it.
    pub mask2: Mask256,
}

impl TeddySlim2Mask256 {
    /// Scans `haystack` from `at` in 32-byte windows and returns the first
    /// verified match, if any. Windows are loaded one byte past `at` and
    /// candidates are verified relative to `at - 1`.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2, and `haystack[at..]` must hold at least
    /// `teddy.minimum_len()` bytes (only debug-asserted here).
    #[target_feature(enable = "avx2")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        assert_eq!(8, teddy.buckets.len());

        at += 1;
        let len = haystack.len();
        let mut carry0 = ones256();
        // Start offset of the final full 32-byte window.
        let last = len - 32;
        while at <= last {
            let found = self.probe(pats, teddy, haystack, at, &mut carry0);
            if found.is_some() {
                return found;
            }
            at += 32;
        }
        if at >= len {
            return None;
        }
        // The final window overlaps the previous one, so the carried state
        // is reset before re-scanning the last 32 bytes.
        carry0 = ones256();
        self.probe(pats, teddy, haystack, last, &mut carry0)
    }

    /// Computes the candidate for the window at `at` and, when non-zero,
    /// verifies it against positions starting at `at - 1`.
    #[inline(always)]
    unsafe fn probe(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
    ) -> Option<Match> {
        let cand = self.candidate(haystack, at, prev0);
        if is_all_zeroes256(cand) {
            return None;
        }
        teddy.verify256(pats, haystack, at - 1, cand)
    }

    /// Computes the candidate vector for the 32 bytes at `at`, combining
    /// this window's `mask1` result (delayed by one byte via `prev0`) with
    /// its `mask2` result. `prev0` is updated for the next window.
    #[inline(always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 32);

        let window = loadu256(haystack, at);
        let (first, second) = members2m256(window, self.mask1, self.mask2);
        let first_delayed = alignr256_15(first, *prev0);
        *prev0 = first;
        _mm256_and_si256(first_delayed, second)
    }
}
/// Executor with two masks, 256-bit vectors and 16 buckets. Each step reads
/// 16 haystack bytes and duplicates them into both 128-bit lanes.
#[derive(Clone, Debug)]
pub struct TeddyFat2Mask256 {
    /// Lookup tables tested against the first byte of a candidate position.
    pub mask1: Mask256,
    /// Lookup tables tested against the byte that follows it.
    pub mask2: Mask256,
}

impl TeddyFat2Mask256 {
    /// Scans `haystack` from `at` in 16-byte windows and returns the first
    /// verified match, if any. Windows are loaded one byte past `at` and
    /// candidates are verified relative to `at - 1`.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2, and `haystack[at..]` must hold at least
    /// `teddy.minimum_len()` bytes (only debug-asserted here).
    #[target_feature(enable = "avx2")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        assert_eq!(16, teddy.buckets.len());

        at += 1;
        let len = haystack.len();
        let mut carry0 = ones256();
        // Start offset of the final full 16-byte window.
        let last = len - 16;
        while at <= last {
            let found = self.probe(pats, teddy, haystack, at, &mut carry0);
            if found.is_some() {
                return found;
            }
            at += 16;
        }
        if at >= len {
            return None;
        }
        // The final window overlaps the previous one, so the carried state
        // is reset before re-scanning the last 16 bytes.
        carry0 = ones256();
        self.probe(pats, teddy, haystack, last, &mut carry0)
    }

    /// Computes the candidate for the window at `at` and, when non-zero,
    /// verifies it against positions starting at `at - 1`.
    #[inline(always)]
    unsafe fn probe(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
    ) -> Option<Match> {
        let cand = self.candidate(haystack, at, prev0);
        if is_all_zeroes256(cand) {
            return None;
        }
        teddy.verify_fat256(pats, haystack, at - 1, cand)
    }

    /// Computes the candidate vector for the 16 bytes at `at` (broadcast to
    /// both lanes), combining the `mask1` result delayed by one byte with
    /// the `mask2` result. `prev0` is updated for the next window.
    #[inline(always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 16);

        let half = loadu128(haystack, at);
        let window = _mm256_broadcastsi128_si256(half);
        let (first, second) = members2m256(window, self.mask1, self.mask2);
        // `_mm256_alignr_epi8` shifts within each 128-bit lane, which fits
        // here because both lanes describe the same 16 haystack bytes.
        let first_delayed = _mm256_alignr_epi8(first, *prev0, 15);
        *prev0 = first;
        _mm256_and_si256(first_delayed, second)
    }
}
/// Executor with three masks, 128-bit vectors and 8 buckets.
#[derive(Clone, Debug)]
pub struct TeddySlim3Mask128 {
    /// Lookup tables tested against the first byte of a candidate position.
    pub mask1: Mask128,
    /// Lookup tables tested against the second byte.
    pub mask2: Mask128,
    /// Lookup tables tested against the third byte.
    pub mask3: Mask128,
}

impl TeddySlim3Mask128 {
    /// Scans `haystack` from `at` in 16-byte windows and returns the first
    /// verified match, if any. Windows are loaded two bytes past `at` and
    /// candidates are verified relative to `at - 2`.
    ///
    /// # Safety
    ///
    /// The CPU must support SSSE3, and `haystack[at..]` must hold at least
    /// `teddy.minimum_len()` bytes (only debug-asserted here).
    #[target_feature(enable = "ssse3")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        assert_eq!(8, teddy.buckets.len());

        at += 2;
        let len = haystack.len();
        let mut carry0 = ones128();
        let mut carry1 = ones128();
        // Start offset of the final full 16-byte window.
        let last = len - 16;
        while at <= last {
            let found = self
                .probe(pats, teddy, haystack, at, &mut carry0, &mut carry1);
            if found.is_some() {
                return found;
            }
            at += 16;
        }
        if at >= len {
            return None;
        }
        // The final window overlaps the previous one, so the carried state
        // is reset before re-scanning the last 16 bytes.
        carry0 = ones128();
        carry1 = ones128();
        self.probe(pats, teddy, haystack, last, &mut carry0, &mut carry1)
    }

    /// Computes the candidate for the window at `at` and, when non-zero,
    /// verifies it against positions starting at `at - 2`.
    #[inline(always)]
    unsafe fn probe(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m128i,
        prev1: &mut __m128i,
    ) -> Option<Match> {
        let cand = self.candidate(haystack, at, prev0, prev1);
        if is_all_zeroes128(cand) {
            return None;
        }
        teddy.verify128(pats, haystack, at - 2, cand)
    }

    /// Computes the candidate vector for the 16 bytes at `at`: the `mask1`
    /// result delayed by two bytes, ANDed with the `mask2` result delayed
    /// by one byte, ANDed with the `mask3` result. `prev0` and `prev1` are
    /// updated for the next window.
    #[inline(always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m128i,
        prev1: &mut __m128i,
    ) -> __m128i {
        debug_assert!(haystack[at..].len() >= 16);

        let window = loadu128(haystack, at);
        let (first, second, third) =
            members3m128(window, self.mask1, self.mask2, self.mask3);
        let first_delayed = _mm_alignr_epi8(first, *prev0, 14);
        let second_delayed = _mm_alignr_epi8(second, *prev1, 15);
        *prev0 = first;
        *prev1 = second;
        let leading = _mm_and_si128(first_delayed, second_delayed);
        _mm_and_si128(leading, third)
    }
}
/// Executor with three masks, 256-bit vectors and 8 buckets.
#[derive(Clone, Debug)]
pub struct TeddySlim3Mask256 {
    /// Lookup tables tested against the first byte of a candidate position.
    pub mask1: Mask256,
    /// Lookup tables tested against the second byte.
    pub mask2: Mask256,
    /// Lookup tables tested against the third byte.
    pub mask3: Mask256,
}

impl TeddySlim3Mask256 {
    /// Scans `haystack` from `at` in 32-byte windows and returns the first
    /// verified match, if any. Windows are loaded two bytes past `at` and
    /// candidates are verified relative to `at - 2`.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2, and `haystack[at..]` must hold at least
    /// `teddy.minimum_len()` bytes (only debug-asserted here).
    #[target_feature(enable = "avx2")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        assert_eq!(8, teddy.buckets.len());

        at += 2;
        let len = haystack.len();
        let mut carry0 = ones256();
        let mut carry1 = ones256();
        // Start offset of the final full 32-byte window.
        let last = len - 32;
        while at <= last {
            let found = self
                .probe(pats, teddy, haystack, at, &mut carry0, &mut carry1);
            if found.is_some() {
                return found;
            }
            at += 32;
        }
        if at >= len {
            return None;
        }
        // The final window overlaps the previous one, so the carried state
        // is reset before re-scanning the last 32 bytes.
        carry0 = ones256();
        carry1 = ones256();
        self.probe(pats, teddy, haystack, last, &mut carry0, &mut carry1)
    }

    /// Computes the candidate for the window at `at` and, when non-zero,
    /// verifies it against positions starting at `at - 2`.
    #[inline(always)]
    unsafe fn probe(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
    ) -> Option<Match> {
        let cand = self.candidate(haystack, at, prev0, prev1);
        if is_all_zeroes256(cand) {
            return None;
        }
        teddy.verify256(pats, haystack, at - 2, cand)
    }

    /// Computes the candidate vector for the 32 bytes at `at`: the `mask1`
    /// result delayed by two bytes, ANDed with the `mask2` result delayed
    /// by one byte, ANDed with the `mask3` result. `prev0` and `prev1` are
    /// updated for the next window.
    #[inline(always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 32);

        let window = loadu256(haystack, at);
        let (first, second, third) =
            members3m256(window, self.mask1, self.mask2, self.mask3);
        let first_delayed = alignr256_14(first, *prev0);
        let second_delayed = alignr256_15(second, *prev1);
        *prev0 = first;
        *prev1 = second;
        let leading = _mm256_and_si256(first_delayed, second_delayed);
        _mm256_and_si256(leading, third)
    }
}
/// Executor with three masks, 256-bit vectors and 16 buckets. Each step
/// reads 16 haystack bytes and duplicates them into both 128-bit lanes.
#[derive(Clone, Debug)]
pub struct TeddyFat3Mask256 {
    /// Lookup tables tested against the first byte of a candidate position.
    pub mask1: Mask256,
    /// Lookup tables tested against the second byte.
    pub mask2: Mask256,
    /// Lookup tables tested against the third byte.
    pub mask3: Mask256,
}

impl TeddyFat3Mask256 {
    /// Scans `haystack` from `at` in 16-byte windows and returns the first
    /// verified match, if any. Windows are loaded two bytes past `at` and
    /// candidates are verified relative to `at - 2`.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2, and `haystack[at..]` must hold at least
    /// `teddy.minimum_len()` bytes (only debug-asserted here).
    #[target_feature(enable = "avx2")]
    unsafe fn find_at(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
        assert_eq!(16, teddy.buckets.len());

        at += 2;
        let len = haystack.len();
        let mut carry0 = ones256();
        let mut carry1 = ones256();
        // Start offset of the final full 16-byte window.
        let last = len - 16;
        while at <= last {
            let found = self
                .probe(pats, teddy, haystack, at, &mut carry0, &mut carry1);
            if found.is_some() {
                return found;
            }
            at += 16;
        }
        if at >= len {
            return None;
        }
        // The final window overlaps the previous one, so the carried state
        // is reset before re-scanning the last 16 bytes.
        carry0 = ones256();
        carry1 = ones256();
        self.probe(pats, teddy, haystack, last, &mut carry0, &mut carry1)
    }

    /// Computes the candidate for the window at `at` and, when non-zero,
    /// verifies it against positions starting at `at - 2`.
    #[inline(always)]
    unsafe fn probe(
        &self,
        pats: &Patterns,
        teddy: &Teddy,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
    ) -> Option<Match> {
        let cand = self.candidate(haystack, at, prev0, prev1);
        if is_all_zeroes256(cand) {
            return None;
        }
        teddy.verify_fat256(pats, haystack, at - 2, cand)
    }

    /// Computes the candidate vector for the 16 bytes at `at` (broadcast to
    /// both lanes): the `mask1` result delayed by two bytes, ANDed with the
    /// `mask2` result delayed by one byte, ANDed with the `mask3` result.
    /// `prev0` and `prev1` are updated for the next window.
    #[inline(always)]
    unsafe fn candidate(
        &self,
        haystack: &[u8],
        at: usize,
        prev0: &mut __m256i,
        prev1: &mut __m256i,
    ) -> __m256i {
        debug_assert!(haystack[at..].len() >= 16);

        let half = loadu128(haystack, at);
        let window = _mm256_broadcastsi128_si256(half);
        let (first, second, third) =
            members3m256(window, self.mask1, self.mask2, self.mask3);
        // Per-lane shifts fit here because both lanes describe the same 16
        // haystack bytes.
        let first_delayed = _mm256_alignr_epi8(first, *prev0, 14);
        let second_delayed = _mm256_alignr_epi8(second, *prev1, 15);
        *prev0 = first;
        *prev1 = second;
        let leading = _mm256_and_si256(first_delayed, second_delayed);
        _mm256_and_si256(leading, third)
    }
}
/// A pair of 128-bit byte-shuffle lookup tables, one per nibble of a
/// haystack byte.
#[derive(Clone, Copy, Debug)]
pub struct Mask128 {
    /// Table indexed by the low nibble of each haystack byte.
    lo: __m128i,
    /// Table indexed by the high nibble of each haystack byte.
    hi: __m128i,
}

impl Mask128 {
    /// Builds the vector form of a compiled mask from its `lo128()` and
    /// `hi128()` halves.
    pub fn new(mask: compile::Mask) -> Mask128 {
        // SAFETY: `transmute` only compiles when source and target have the
        // same size, and every bit pattern is a valid `__m128i`.
        // NOTE(review): presumably `lo128()`/`hi128()` return 16-byte
        // arrays; they are defined in `compile`, not visible here.
        let lo: __m128i = unsafe { mem::transmute(mask.lo128()) };
        let hi: __m128i = unsafe { mem::transmute(mask.hi128()) };
        Mask128 { lo, hi }
    }
}
/// A pair of 256-bit byte-shuffle lookup tables, one per nibble of a
/// haystack byte.
#[derive(Clone, Copy, Debug)]
pub struct Mask256 {
    /// Table indexed by the low nibble of each haystack byte.
    lo: __m256i,
    /// Table indexed by the high nibble of each haystack byte.
    hi: __m256i,
}

impl Mask256 {
    /// Builds the vector form of a compiled mask from its `lo256()` and
    /// `hi256()` halves.
    pub fn new(mask: compile::Mask) -> Mask256 {
        // SAFETY: `transmute` only compiles when source and target have the
        // same size, and every bit pattern is a valid `__m256i`.
        // NOTE(review): presumably `lo256()`/`hi256()` return 32-byte
        // arrays; they are defined in `compile`, not visible here.
        let lo: __m256i = unsafe { mem::transmute(mask.lo256()) };
        let hi: __m256i = unsafe { mem::transmute(mask.hi256()) };
        Mask256 { lo, hi }
    }
}
/// Looks up every byte of `chunk` in `mask1` and returns, per byte, the AND
/// of the low-nibble lookup (`mask1.lo`) and the high-nibble lookup
/// (`mask1.hi`).
///
/// # Safety
///
/// The CPU must support SSSE3.
#[target_feature(enable = "ssse3")]
unsafe fn members1m128(chunk: __m128i, mask1: Mask128) -> __m128i {
    let nibble = _mm_set1_epi8(0xF);
    let low_nibbles = _mm_and_si128(chunk, nibble);
    // The 16-bit shift drags bits across byte boundaries; masking with 0x0F
    // afterwards leaves just each byte's own high nibble.
    let shifted = _mm_srli_epi16(chunk, 4);
    let high_nibbles = _mm_and_si128(shifted, nibble);

    let by_low = _mm_shuffle_epi8(mask1.lo, low_nibbles);
    let by_high = _mm_shuffle_epi8(mask1.hi, high_nibbles);
    _mm_and_si128(by_low, by_high)
}
/// Looks up every byte of `chunk` in `mask1` and returns, per byte, the AND
/// of the low-nibble lookup (`mask1.lo`) and the high-nibble lookup
/// (`mask1.hi`).
///
/// # Safety
///
/// The CPU must support AVX2.
#[target_feature(enable = "avx2")]
unsafe fn members1m256(chunk: __m256i, mask1: Mask256) -> __m256i {
    let nibble = _mm256_set1_epi8(0xF);
    let low_nibbles = _mm256_and_si256(chunk, nibble);
    // The 16-bit shift drags bits across byte boundaries; masking with 0x0F
    // afterwards leaves just each byte's own high nibble.
    let shifted = _mm256_srli_epi16(chunk, 4);
    let high_nibbles = _mm256_and_si256(shifted, nibble);

    let by_low = _mm256_shuffle_epi8(mask1.lo, low_nibbles);
    let by_high = _mm256_shuffle_epi8(mask1.hi, high_nibbles);
    _mm256_and_si256(by_low, by_high)
}
/// Looks up every byte of `chunk` in `mask1` and `mask2`, sharing the nibble
/// split. Each returned vector is, per byte, the AND of that mask's
/// low-nibble and high-nibble lookups; results are ordered `(mask1, mask2)`.
///
/// # Safety
///
/// The CPU must support SSSE3.
#[target_feature(enable = "ssse3")]
unsafe fn members2m128(
    chunk: __m128i,
    mask1: Mask128,
    mask2: Mask128,
) -> (__m128i, __m128i) {
    let nibble = _mm_set1_epi8(0xF);
    let low_nibbles = _mm_and_si128(chunk, nibble);
    let shifted = _mm_srli_epi16(chunk, 4);
    let high_nibbles = _mm_and_si128(shifted, nibble);

    let low1 = _mm_shuffle_epi8(mask1.lo, low_nibbles);
    let high1 = _mm_shuffle_epi8(mask1.hi, high_nibbles);
    let hits1 = _mm_and_si128(low1, high1);

    let low2 = _mm_shuffle_epi8(mask2.lo, low_nibbles);
    let high2 = _mm_shuffle_epi8(mask2.hi, high_nibbles);
    let hits2 = _mm_and_si128(low2, high2);

    (hits1, hits2)
}
/// Looks up every byte of `chunk` in `mask1` and `mask2`, sharing the nibble
/// split. Each returned vector is, per byte, the AND of that mask's
/// low-nibble and high-nibble lookups; results are ordered `(mask1, mask2)`.
///
/// # Safety
///
/// The CPU must support AVX2.
#[target_feature(enable = "avx2")]
unsafe fn members2m256(
    chunk: __m256i,
    mask1: Mask256,
    mask2: Mask256,
) -> (__m256i, __m256i) {
    let nibble = _mm256_set1_epi8(0xF);
    let low_nibbles = _mm256_and_si256(chunk, nibble);
    let shifted = _mm256_srli_epi16(chunk, 4);
    let high_nibbles = _mm256_and_si256(shifted, nibble);

    let low1 = _mm256_shuffle_epi8(mask1.lo, low_nibbles);
    let high1 = _mm256_shuffle_epi8(mask1.hi, high_nibbles);
    let hits1 = _mm256_and_si256(low1, high1);

    let low2 = _mm256_shuffle_epi8(mask2.lo, low_nibbles);
    let high2 = _mm256_shuffle_epi8(mask2.hi, high_nibbles);
    let hits2 = _mm256_and_si256(low2, high2);

    (hits1, hits2)
}
/// Looks up every byte of `chunk` in `mask1`, `mask2` and `mask3`, sharing
/// the nibble split. Each returned vector is, per byte, the AND of that
/// mask's low-nibble and high-nibble lookups; results are ordered
/// `(mask1, mask2, mask3)`.
///
/// # Safety
///
/// The CPU must support SSSE3.
#[target_feature(enable = "ssse3")]
unsafe fn members3m128(
    chunk: __m128i,
    mask1: Mask128,
    mask2: Mask128,
    mask3: Mask128,
) -> (__m128i, __m128i, __m128i) {
    let nibble = _mm_set1_epi8(0xF);
    let low_nibbles = _mm_and_si128(chunk, nibble);
    let shifted = _mm_srli_epi16(chunk, 4);
    let high_nibbles = _mm_and_si128(shifted, nibble);

    let low1 = _mm_shuffle_epi8(mask1.lo, low_nibbles);
    let high1 = _mm_shuffle_epi8(mask1.hi, high_nibbles);
    let hits1 = _mm_and_si128(low1, high1);

    let low2 = _mm_shuffle_epi8(mask2.lo, low_nibbles);
    let high2 = _mm_shuffle_epi8(mask2.hi, high_nibbles);
    let hits2 = _mm_and_si128(low2, high2);

    let low3 = _mm_shuffle_epi8(mask3.lo, low_nibbles);
    let high3 = _mm_shuffle_epi8(mask3.hi, high_nibbles);
    let hits3 = _mm_and_si128(low3, high3);

    (hits1, hits2, hits3)
}
/// Looks up every byte of `chunk` in `mask1`, `mask2` and `mask3`, sharing
/// the nibble split. Each returned vector is, per byte, the AND of that
/// mask's low-nibble and high-nibble lookups; results are ordered
/// `(mask1, mask2, mask3)`.
///
/// # Safety
///
/// The CPU must support AVX2.
#[target_feature(enable = "avx2")]
unsafe fn members3m256(
    chunk: __m256i,
    mask1: Mask256,
    mask2: Mask256,
    mask3: Mask256,
) -> (__m256i, __m256i, __m256i) {
    let nibble = _mm256_set1_epi8(0xF);
    let low_nibbles = _mm256_and_si256(chunk, nibble);
    let shifted = _mm256_srli_epi16(chunk, 4);
    let high_nibbles = _mm256_and_si256(shifted, nibble);

    let low1 = _mm256_shuffle_epi8(mask1.lo, low_nibbles);
    let high1 = _mm256_shuffle_epi8(mask1.hi, high_nibbles);
    let hits1 = _mm256_and_si256(low1, high1);

    let low2 = _mm256_shuffle_epi8(mask2.lo, low_nibbles);
    let high2 = _mm256_shuffle_epi8(mask2.hi, high_nibbles);
    let hits2 = _mm256_and_si256(low2, high2);

    let low3 = _mm256_shuffle_epi8(mask3.lo, low_nibbles);
    let high3 = _mm256_shuffle_epi8(mask3.hi, high_nibbles);
    let hits3 = _mm256_and_si256(low3, high3);

    (hits1, hits2, hits3)
}