#![allow(unsafe_op_in_unsafe_fn)]
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;
/// Per-lane bit weights used to collapse byte-wise comparison results into a
/// bitmask: lane `k` of each 8-lane half carries the weight `1 << k`.
///
/// Kept as a `static` so its address can be handed to a vector load.
static BITMASK: [u8; 16] = [
    1 << 0,
    1 << 1,
    1 << 2,
    1 << 3,
    1 << 4,
    1 << 5,
    1 << 6,
    1 << 7,
    1 << 0,
    1 << 1,
    1 << 2,
    1 << 3,
    1 << 4,
    1 << 5,
    1 << 6,
    1 << 7,
];
/// Collapses four 16-lane byte vectors into one 64-bit mask.
///
/// Each input is AND-ed with `bm` and the results are reduced with three
/// rounds of pairwise byte addition. When every lane of `c0..c3` is `0x00` or
/// `0xFF` and `bm` holds the weights `1, 2, 4, ..., 128` repeated twice, bit
/// `16 * n + k` of the result is set exactly when lane `k` of `c{n}` is `0xFF`.
///
/// Gated on `aarch64` because the vector types and intrinsics used here are
/// only imported on that architecture; without the gate the file does not
/// compile for the targets served by the non-aarch64 `scan_neon` stub.
///
/// # Safety
///
/// Must only be executed on a CPU that supports the NEON instructions behind
/// the intrinsics used here.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
unsafe fn bulk_movemask_4x16(
    c0: uint8x16_t,
    c1: uint8x16_t,
    c2: uint8x16_t,
    c3: uint8x16_t,
    bm: uint8x16_t,
) -> u64 {
    // Keep a single distinct bit per lane so pairwise sums cannot collide.
    let t0 = vandq_u8(c0, bm);
    let t1 = vandq_u8(c1, bm);
    let t2 = vandq_u8(c2, bm);
    let t3 = vandq_u8(c3, bm);
    // Round 1: 16 lanes -> 8 partial sums per input vector.
    let p01 = vpaddq_u8(t0, t1);
    let p23 = vpaddq_u8(t2, t3);
    // Round 2: 4 partial sums per input vector, all four inputs in one register.
    let p0123 = vpaddq_u8(p01, p23);
    // Round 3: 2 bytes per input vector; the low 8 bytes now hold the mask.
    let r = vpaddq_u8(p0123, p0123);
    vgetq_lane_u64(vreinterpretq_u64_u8(r), 0)
}
/// Builds a byte mask (`0xFF` per matching lane) of the JSON structural
/// characters `{`, `}`, `[`, `]`, `:` and `,` in a `uint8x16_t`.
///
/// The argument expression is evaluated exactly once. The expansion calls NEON
/// intrinsics, so it must be used where those are in scope and inside an
/// `unsafe` context.
macro_rules! struct_or {
    ($v:expr) => {{
        let raw = $v;
        // '[' (0x5B) and ']' (0x5D) differ from '{' (0x7B) and '}' (0x7D) only
        // in bit 0x20, so setting that bit lets two compares cover all four.
        let folded = vorrq_u8(raw, vdupq_n_u8(0x20));
        let brackets = vorrq_u8(
            vceqq_u8(folded, vdupq_n_u8(b'{')),
            vceqq_u8(folded, vdupq_n_u8(b'}')),
        );
        // ':' and ',' are compared against the untouched bytes: on the folded
        // vector the control bytes 0x1A and 0x0C would alias to ':' and ','.
        let separators = vorrq_u8(
            vceqq_u8(raw, vdupq_n_u8(b':')),
            vceqq_u8(raw, vdupq_n_u8(b',')),
        );
        vorrq_u8(brackets, separators)
    }};
}
/// Marks the bytes of a block that are escaped by a preceding backslash.
///
/// `backslash` has bit `k` set when byte `k` of the block is `\`.
/// `prev_escaped` is `1` when the first byte of the block is escaped by a
/// backslash that ended the previous block, and `0` otherwise.
///
/// Returns `(escaped, carry)`: `escaped` has bit `k` set when byte `k` is
/// escaped, and `carry` is `1` when the byte following bit 63 is escaped.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn neon_escaped_mask(backslash: u64, prev_escaped: u64) -> (u64, u64) {
    const EVEN_BITS: u64 = 0x5555_5555_5555_5555;
    // A leading backslash that is itself escaped does not escape anything.
    let backslash = backslash & !prev_escaped;
    let follows_escape = (backslash << 1) | prev_escaped;
    // Adding the start bit of every odd-positioned run to the backslash mask
    // ripples a carry through that run, leaving only the even-started runs.
    let odd_starts = backslash & !EVEN_BITS & !follows_escape;
    let (even_runs, overflow) = odd_starts.overflowing_add(backslash);
    // In a run every second byte is escaped; which half depends on the parity
    // of the run start, hence the selective inversion.
    let invert = even_runs << 1;
    ((EVEN_BITS ^ invert) & follows_escape, u64::from(overflow))
}

/// Appends one tape entry for every set bit of `mask`, lowest bit first.
///
/// Bit `k` refers to `bytes[base + k]`; the stored value is
/// `tag_byte(bytes[base + k], base + k)`. Once `tape` is full the remaining
/// bits are dropped, mirroring the capacity guard of the scalar tail.
///
/// # Safety
///
/// `base + k` must be a valid index into `bytes` for every set bit `k`.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
unsafe fn neon_emit_marked(
    bytes: &[u8],
    tape: &mut [u32],
    tape_idx: &mut usize,
    base: usize,
    mut mask: u64,
) {
    use crate::scanner::tag_byte;
    while mask != 0 {
        if *tape_idx >= tape.len() {
            return;
        }
        let pos = base + mask.trailing_zeros() as usize;
        // Clear the lowest set bit.
        mask &= mask - 1;
        *tape.get_unchecked_mut(*tape_idx) = tag_byte(*bytes.get_unchecked(pos), pos);
        *tape_idx += 1;
    }
}

/// Scans `bytes` and records, in input order, one tape entry for every
/// unescaped `"` and for every `{`, `}`, `[`, `]`, `:` or `,` that lies
/// outside a string. Each entry is `tag_byte(byte, position)`.
///
/// Input is consumed in 64-byte blocks, then at most one 32-byte block, then
/// byte by byte. The in-string and escape state is carried between all three
/// stages, so the result does not depend on where a string or a backslash run
/// falls relative to a block boundary. Entries that do not fit into `tape`
/// are silently dropped.
///
/// Returns the number of entries written to `tape`.
///
/// # Safety
///
/// Must only be executed on a CPU that supports the instructions behind the
/// intrinsics used here (NEON and the polynomial multiply of `vmull_p64`).
#[cfg(target_arch = "aarch64")]
#[inline(always)]
pub unsafe fn scan_neon(bytes: &[u8], tape: &mut [u32]) -> usize {
    use crate::scanner::tag_byte;
    let mut tape_idx = 0;
    let mut i = 0;
    // All ones while inside a string at the start of the next block, else 0.
    let mut prev_in_string: u64 = 0;
    // 1 when the first byte of the next block is escaped, else 0.
    let mut prev_escaped: u64 = 0;
    let q_splat = vdupq_n_u8(b'"');
    let bs_splat = vdupq_n_u8(b'\\');
    let bm = vld1q_u8(BITMASK.as_ptr());

    while i + 64 <= bytes.len() {
        let v0 = vld1q_u8(bytes.as_ptr().add(i));
        let v1 = vld1q_u8(bytes.as_ptr().add(i + 16));
        let v2 = vld1q_u8(bytes.as_ptr().add(i + 32));
        let v3 = vld1q_u8(bytes.as_ptr().add(i + 48));
        let raw_q64 = bulk_movemask_4x16(
            vceqq_u8(v0, q_splat),
            vceqq_u8(v1, q_splat),
            vceqq_u8(v2, q_splat),
            vceqq_u8(v3, q_splat),
            bm,
        );
        let bs64 = bulk_movemask_4x16(
            vceqq_u8(v0, bs_splat),
            vceqq_u8(v1, bs_splat),
            vceqq_u8(v2, bs_splat),
            vceqq_u8(v3, bs_splat),
            bm,
        );
        let s64 = bulk_movemask_4x16(
            struct_or!(v0),
            struct_or!(v1),
            struct_or!(v2),
            struct_or!(v3),
            bm,
        );
        // An escaped quote neither opens nor closes a string.
        let (escaped, carry) = neon_escaped_mask(bs64, prev_escaped);
        prev_escaped = carry;
        let q64 = raw_q64 & !escaped;
        // Carry-less multiply by all ones is a prefix XOR: bit k becomes the
        // parity of the quotes at or below k, i.e. "inside a string".
        let string64 = (vmull_p64(q64, !0u64) as u64) ^ prev_in_string;
        // Broadcast the state after the last byte to every bit.
        prev_in_string = ((string64 as i64) >> 63) as u64;
        neon_emit_marked(bytes, tape, &mut tape_idx, i, (s64 & !string64) | q64);
        i += 64;
    }

    while i + 32 <= bytes.len() {
        let v0 = vld1q_u8(bytes.as_ptr().add(i));
        let v1 = vld1q_u8(bytes.as_ptr().add(i + 16));
        let zero = vdupq_n_u8(0);
        // The upper 32 bits of these masks are zero because of the zero vectors.
        let raw_q32 =
            bulk_movemask_4x16(vceqq_u8(v0, q_splat), vceqq_u8(v1, q_splat), zero, zero, bm);
        let bs32 =
            bulk_movemask_4x16(vceqq_u8(v0, bs_splat), vceqq_u8(v1, bs_splat), zero, zero, bm);
        let s32 = bulk_movemask_4x16(struct_or!(v0), struct_or!(v1), zero, zero, bm) as u32;
        // With no backslash above bit 31 the adder cannot overflow; whether
        // the byte after this block is escaped is bit 32 of the mask instead.
        let (escaped, _) = neon_escaped_mask(bs32, prev_escaped);
        prev_escaped = (escaped >> 32) & 1;
        let q32 = (raw_q32 & !escaped) as u32;
        let string32 = ((vmull_p64(u64::from(q32), !0u64) as u64) ^ prev_in_string) as u32;
        prev_in_string = ((string32 as i32) >> 31) as u64;
        let active = (s32 & !string32) | q32;
        neon_emit_marked(bytes, tape, &mut tape_idx, i, u64::from(active));
        i += 32;
    }

    // Scalar tail for the final (< 32) bytes, resuming from the block state.
    // NOTE: the block loops treat every backslash as an escape while this tail
    // only does so inside a string; both agree whenever backslashes occur only
    // inside strings.
    let mut in_string = prev_in_string != 0;
    let mut escape = prev_escaped != 0;
    while i < bytes.len() {
        let b = *bytes.get_unchecked(i);
        if escape {
            escape = false;
        } else if b == b'\\' && in_string {
            escape = true;
        } else if b == b'"' {
            if tape_idx < tape.len() {
                *tape.get_unchecked_mut(tape_idx) = tag_byte(b, i);
                tape_idx += 1;
            }
            in_string = !in_string;
        } else if !in_string {
            match b {
                b'{' | b'}' | b'[' | b']' | b':' | b',' if tape_idx < tape.len() => {
                    *tape.get_unchecked_mut(tape_idx) = tag_byte(b, i);
                    tape_idx += 1;
                }
                _ => {}
            }
        }
        i += 1;
    }
    tape_idx
}
/// Placeholder compiled on targets other than `aarch64` so that callers can
/// reference `scan_neon` unconditionally.
///
/// # Panics
///
/// Always panics: the NEON scanner does not exist on this architecture.
///
/// # Safety
///
/// No memory is accessed; the function is `unsafe` only to keep the same
/// signature as the `aarch64` implementation.
#[cfg(not(target_arch = "aarch64"))]
#[cold]
pub unsafe fn scan_neon(_bytes: &[u8], _tape: &mut [u32]) -> usize {
    unreachable!("scan_neon called on non-aarch64 architecture")
}