#![allow(unsafe_op_in_unsafe_fn)]
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
/// Scans `bytes` for JSON structural characters and records each hit on `tape`.
///
/// An entry is written, in ascending byte order, for every unescaped `"` and
/// for every `{`, `}`, `[`, `]`, `:` or `,` that lies outside a string. Each
/// entry is the value returned by `tag_byte(byte, position)`.
///
/// The input is consumed in 64-byte blocks, then 32-byte blocks, then one
/// byte at a time. String and escape state is carried across all three
/// stages, so the result does not depend on where a block boundary falls.
///
/// At most one entry is produced per input byte, so a tape of `bytes.len()`
/// slots never truncates. If the tape is shorter than the number of hits, the
/// first `tape.len()` hits are stored and the remainder are dropped; nothing
/// is ever written past the end of `tape`.
///
/// Note for malformed input: the vector stages treat every backslash as an
/// escape, whereas the scalar stage only does so inside a string. Backslashes
/// cannot appear outside strings in well-formed JSON, so valid documents are
/// unaffected.
///
/// Returns the number of entries written to `tape`.
///
/// # Safety
///
/// The executing CPU must support the `avx2` and `pclmulqdq` features.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,pclmulqdq")]
pub unsafe fn scan_avx2(bytes: &[u8], tape: &mut [u32]) -> usize {
    use crate::scanner::tag_byte;

    /// Computes which positions are escaped by a preceding backslash.
    ///
    /// `backslash` has one bit per backslash byte; `prev_escaped` is 1 when
    /// the first position is escaped by the previous block, otherwise 0.
    /// Returns `(escaped, carry)`, where `carry` is 1 when a backslash at
    /// bit 63 escapes the first byte of the next block.
    #[inline]
    fn escaped_mask(backslash: u64, prev_escaped: u64) -> (u64, u64) {
        const EVEN_BITS: u64 = 0x5555_5555_5555_5555;
        // An escaped first byte cannot itself start an escape.
        let backslash = backslash & !prev_escaped;
        let follows_escape = (backslash << 1) | prev_escaped;
        // Adding the first bit of every run that starts on an odd position
        // ripples a carry through that run, leaving only even-start runs.
        let odd_starts = backslash & !EVEN_BITS & !follows_escape;
        let (even_runs, overflow) = odd_starts.overflowing_add(backslash);
        // Even-start runs escape odd positions; odd-start runs escape even ones.
        let invert = even_runs << 1;
        ((EVEN_BITS ^ invert) & follows_escape, overflow as u64)
    }

    let mut tape_idx = 0usize;
    let mut i = 0usize;
    // All ones while inside a string at a block boundary, otherwise zero.
    let mut prev_in_string: u64 = 0;
    // 1 when the first byte of the next block is escaped, otherwise 0.
    let mut prev_escaped: u64 = 0;

    let quote = _mm256_set1_epi8(b'"' as i8);
    let backslash = _mm256_set1_epi8(b'\\' as i8);
    let lbrace = _mm256_set1_epi8(b'{' as i8);
    let rbrace = _mm256_set1_epi8(b'}' as i8);
    let colon = _mm256_set1_epi8(b':' as i8);
    let comma = _mm256_set1_epi8(b',' as i8);
    // OR-ing 0x20 maps `[` onto `{` and `]` onto `}` and nothing else onto them.
    let fold_bit = _mm256_set1_epi8(0x20);
    let all_ones = _mm_cvtsi64_si128(!0i64);

    // One bit per byte of `$v` that equals the splatted `$needle`.
    macro_rules! eq_mask {
        ($v:expr, $needle:expr) => {
            _mm256_movemask_epi8(_mm256_cmpeq_epi8($v, $needle)) as u32
        };
    }

    // One bit per byte of `$v` that is one of `{ } [ ] : ,`.
    macro_rules! structural_mask {
        ($v:expr) => {{
            let raw = $v;
            let folded = _mm256_or_si256(raw, fold_bit);
            let brackets = _mm256_or_si256(
                _mm256_cmpeq_epi8(folded, lbrace),
                _mm256_cmpeq_epi8(folded, rbrace),
            );
            // `:` and `,` are compared unfolded: folding would also match
            // the control bytes 0x1A and 0x0C.
            let separators = _mm256_or_si256(
                _mm256_cmpeq_epi8(raw, colon),
                _mm256_cmpeq_epi8(raw, comma),
            );
            _mm256_movemask_epi8(_mm256_or_si256(brackets, separators)) as u32
        }};
    }

    // Prefix XOR: bit k of the result is the XOR of bits 0..=k of `$bits`.
    macro_rules! prefix_xor {
        ($bits:expr) => {
            _mm_cvtsi128_si64(_mm_clmulepi64_si128(
                _mm_cvtsi64_si128($bits as i64),
                all_ones,
                0x00,
            )) as u64
        };
    }

    // Writes one tape entry per set bit of `$mask`, lowest bit first.
    // SAFETY (for every use): the caller has checked that the number of set
    // bits fits in the remaining tape, and every set bit indexes a byte of
    // the block starting at `$base`, which lies entirely inside `bytes`.
    macro_rules! emit {
        ($mask:expr, $base:expr) => {{
            let mut pending: u64 = $mask;
            let base: usize = $base;
            while pending != 0 {
                let pos = base + pending.trailing_zeros() as usize;
                pending &= pending - 1;
                *tape.get_unchecked_mut(tape_idx) = tag_byte(*bytes.get_unchecked(pos), pos);
                tape_idx += 1;
            }
        }};
    }

    // 64-byte blocks.
    while bytes.len() - i >= 64 {
        // SAFETY: at least 64 readable bytes remain starting at `i`.
        let ptr = bytes.as_ptr().add(i);
        let lo = _mm256_loadu_si256(ptr as *const __m256i);
        let hi = _mm256_loadu_si256(ptr.add(32) as *const __m256i);

        let quote_bits = (eq_mask!(lo, quote) as u64) | ((eq_mask!(hi, quote) as u64) << 32);
        let backslash_bits =
            (eq_mask!(lo, backslash) as u64) | ((eq_mask!(hi, backslash) as u64) << 32);
        let structural_bits =
            (structural_mask!(lo) as u64) | ((structural_mask!(hi) as u64) << 32);

        let (escaped, next_escaped) = escaped_mask(backslash_bits, prev_escaped);
        let real_quotes = quote_bits & !escaped;
        // Set from each opening quote up to, but excluding, its closing quote.
        let in_string = prefix_xor!(real_quotes) ^ prev_in_string;
        let hits = (structural_bits & !in_string) | real_quotes;

        // Not enough room for this block: leave the carried state untouched
        // and let the bounds-checked stages below deal with the remainder.
        if hits.count_ones() as usize > tape.len() - tape_idx {
            break;
        }

        prev_escaped = next_escaped;
        prev_in_string = 0u64.wrapping_sub(in_string >> 63);
        emit!(hits, i);
        i += 64;
    }

    // 32-byte blocks, handled as 64-bit masks whose upper half is empty.
    while bytes.len() - i >= 32 {
        // SAFETY: at least 32 readable bytes remain starting at `i`.
        let v = _mm256_loadu_si256(bytes.as_ptr().add(i) as *const __m256i);

        let quote_bits = eq_mask!(v, quote) as u64;
        let backslash_bits = eq_mask!(v, backslash) as u64;
        let structural_bits = structural_mask!(v) as u64;

        // Bit 32 of `escaped` describes the first byte after this block.
        let (escaped, _) = escaped_mask(backslash_bits, prev_escaped);
        let real_quotes = quote_bits & !escaped;
        let in_string = (prefix_xor!(real_quotes) ^ prev_in_string) & 0xFFFF_FFFF;
        let hits = (structural_bits & !in_string) | real_quotes;

        if hits.count_ones() as usize > tape.len() - tape_idx {
            break;
        }

        prev_escaped = (escaped >> 32) & 1;
        prev_in_string = 0u64.wrapping_sub((in_string >> 31) & 1);
        emit!(hits, i);
        i += 32;
    }

    // Scalar stage: remaining bytes, and everything after the tape runs short.
    let mut in_string = prev_in_string != 0;
    let mut escape = prev_escaped != 0;
    for (offset, &b) in bytes[i..].iter().enumerate() {
        if escape {
            escape = false;
            continue;
        }
        let hit = match b {
            b'\\' if in_string => {
                escape = true;
                false
            }
            b'"' => {
                in_string = !in_string;
                true
            }
            b'{' | b'}' | b'[' | b']' | b':' | b',' => !in_string,
            _ => false,
        };
        if hit {
            match tape.get_mut(tape_idx) {
                Some(slot) => {
                    *slot = tag_byte(b, i + offset);
                    tape_idx += 1;
                }
                // The tape is full; no later hit can be recorded.
                None => break,
            }
        }
    }

    tape_idx
}
/// Stand-in for `scan_avx2` that is compiled on every target other than x86_64.
///
/// It has the same signature as the x86_64 implementation so that code
/// referring to `scan_avx2` still compiles on other architectures. Both
/// arguments are ignored.
///
/// # Safety
///
/// The body performs no unsafe operation; it panics unconditionally.
///
/// # Panics
///
/// Always panics through `unreachable!`, because this function is not meant
/// to be called on a non-x86_64 target.
#[cfg(not(target_arch = "x86_64"))]
pub unsafe fn scan_avx2(_bytes: &[u8], _tape: &mut [u32]) -> usize {
unreachable!("scan_avx2 called on non-x86_64 architecture")
}