#![doc = include_str!("../README.md")]
#![cfg_attr(not(feature = "std"), no_std)]
#![cfg_attr(docsrs, feature(doc_cfg))]
#![cfg_attr(docsrs, allow(unused_attributes))]
#![deny(missing_docs)]
#[cfg(feature = "std")]
extern crate std;
pub use skip::*;
pub mod skip;
pub mod utils;
mod needles;
pub use needles::Needles;
#[doc(hidden)]
pub mod __macro {
#[cfg(target_arch = "aarch64")]
pub use crate::utils::neon_available;
pub const NEON_CHUNK_SIZE: usize = 16;
pub const SCALAR_THRESHOLD: usize = 32;
#[cfg(target_arch = "aarch64")]
pub use crate::skip::neon::{nibble_mask, range_mask};
#[cfg(target_arch = "aarch64")]
pub use core::arch::aarch64::{uint8x16_t, vceqq_u8, vdupq_n_u8, vld1q_u8, vorrq_u8};
pub const SSE_CHUNK_SIZE: usize = 16;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub use crate::utils::sse42_available;
#[cfg(target_arch = "x86")]
pub use core::arch::x86::{
__m128i, _mm_and_si128, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_min_epu8, _mm_movemask_epi8,
_mm_or_si128, _mm_set1_epi8, _mm_setzero_si128, _mm_sub_epi8,
};
#[cfg(target_arch = "x86_64")]
pub use core::arch::x86_64::{
__m128i, __m256i, _mm_and_si128, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_min_epu8,
_mm_movemask_epi8, _mm_or_si128, _mm_set1_epi8, _mm_setzero_si128, _mm_sub_epi8,
_mm256_and_si256, _mm256_cmpeq_epi8, _mm256_loadu_si256, _mm256_min_epu8, _mm256_movemask_epi8,
_mm256_or_si256, _mm256_set1_epi8, _mm256_setzero_si256, _mm256_sub_epi8,
};
pub const AVX2_CHUNK_SIZE: usize = 32;
#[cfg(target_arch = "x86_64")]
pub use crate::utils::avx2_available;
#[cfg(target_arch = "wasm32")]
pub use crate::utils::simd128_available;
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
pub use core::arch::wasm32::{
i8x16_bitmask, i8x16_eq, i8x16_splat, i8x16_sub, u8x16_lt, u8x16_splat, v128, v128_and,
v128_load, v128_or,
};
}
/// Generates a function that measures how many leading bytes of a slice
/// belong to a fixed byte class.
///
/// The generated function has the signature `fn name(input: &[u8]) -> usize`
/// and returns the length of the longest prefix of `input` in which every
/// byte is a member of the class. It returns `input.len()` when every byte
/// matches and `0` when the first byte does not (or when the class is empty).
///
/// # Syntax
///
/// ```ignore
/// skip_class! {
///     /// Attributes and doc comments are forwarded to the generated function.
///     pub fn skip_ident(
///         bytes = [b'_'],
///         ranges = [b'a'..=b'z', b'A'..=b'Z', b'0'..=b'9'],
///     );
/// }
/// ```
///
/// * `bytes = [..]` — optional list of `u8` expressions; each listed value is
///   a member of the class.
/// * `ranges = [lo..=hi, ..]` — optional list of inclusive `u8` literal
///   ranges; every value inside a range is a member of the class.
///
/// When both lists are given, `bytes` must come first. The class is the union
/// of both lists. Unsuffixed integer literals such as `0xC0` or
/// `0x80..=0xFF` are accepted on every target.
///
/// # Dispatch
///
/// Inputs shorter than `SCALAR_THRESHOLD` bytes are scanned with a plain
/// scalar loop. Longer inputs use, depending on the target and on the runtime
/// feature checks re-exported through `__macro`: NEON on `aarch64`; AVX2 and
/// then the 128-bit SSE path on `x86_64`; the SSE path on `x86`; `simd128` on
/// `wasm32` builds compiled with that feature. The scalar loop is the
/// fallback everywhere.
#[macro_export]
macro_rules! skip_class {
    (
        $(#[$attr:meta])*
        $vis:vis fn $name:ident
        (
            $(bytes = [$($byte:expr),+ $(,)?] $(,)?)?
            $(ranges = [$($lo:literal ..= $hi:literal),+ $(,)?] $(,)?)?
        )
        ;
    ) => {
        $(#[$attr])*
        #[inline(always)]
        $vis fn $name(input: &[u8]) -> usize {
            // Scalar membership test: the OR of every listed byte and range.
            // With an empty class this is just `false`.
            #[inline(always)]
            fn predicate(b: u8) -> bool {
                let _ = b;
                false
                $( $( || b == $byte )+ )?
                $( $( || ::core::matches!(b, $lo..=$hi) )+ )?
            }

            // Reference implementation: index of the first non-member byte,
            // or the slice length when every byte is a member.
            #[inline(always)]
            fn prefix_len_scalar(input: &[u8]) -> usize {
                input
                    .iter()
                    .position(|&b| !predicate(b))
                    .unwrap_or(input.len())
            }

            // Short inputs never reach the vector paths.
            if input.len() < $crate::__macro::SCALAR_THRESHOLD {
                return prefix_len_scalar(input);
            }

            #[cfg(target_arch = "aarch64")]
            {
                if $crate::__macro::neon_available() {
                    return neon_path(input);
                }
            }

            #[cfg(target_arch = "x86_64")]
            {
                if $crate::__macro::avx2_available() {
                    // SAFETY: `avx2_path` is compiled with the `avx2` target
                    // feature, which `avx2_available()` just reported as usable.
                    return unsafe { avx2_path(input) };
                }
            }

            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            {
                if $crate::__macro::sse42_available() {
                    // SAFETY: `sse_path` is compiled with the `sse4.1` target
                    // feature; a CPU reporting SSE4.2 also provides SSE4.1.
                    return unsafe { sse_path(input) };
                }
            }

            #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
            {
                if $crate::__macro::simd128_available() {
                    return simd128_path(input);
                }
            }

            return prefix_len_scalar(input);

            // ---------------------------------------------------------------
            // aarch64 / NEON: 16 bytes per step.
            // ---------------------------------------------------------------
            #[cfg(target_arch = "aarch64")]
            #[inline(always)]
            fn neon_path(input: &[u8]) -> usize {
                use $crate::__macro::{
                    NEON_CHUNK_SIZE, nibble_mask, range_mask, uint8x16_t, vceqq_u8,
                    vdupq_n_u8, vld1q_u8, vorrq_u8,
                };

                // Per-lane membership mask for one vector.
                #[inline(always)]
                fn mask(chunk: uint8x16_t) -> uint8x16_t {
                    // SAFETY (all blocks in this function): register-only NEON
                    // operations; this path is only entered after
                    // `neon_available()` returned true.
                    let mut acc = unsafe { vdupq_n_u8(0) };
                    $( $(
                        acc = unsafe { vorrq_u8(acc, vceqq_u8(chunk, vdupq_n_u8($byte))) };
                    )+ )?
                    $( $(
                        acc = unsafe { vorrq_u8(acc, range_mask(chunk, $lo, $hi)) };
                    )+ )?
                    let _ = chunk;
                    acc
                }

                let len = input.len();
                if len < NEON_CHUNK_SIZE {
                    return prefix_len_scalar(input);
                }
                let ptr = input.as_ptr();

                // The first chunk is checked with the scalar loop so that an
                // early mismatch returns without touching vector code.
                let first_chunk_len = prefix_len_scalar(&input[..NEON_CHUNK_SIZE]);
                if first_chunk_len != NEON_CHUNK_SIZE {
                    return first_chunk_len;
                }

                let mut cur = NEON_CHUNK_SIZE;
                while cur + NEON_CHUNK_SIZE <= len {
                    // SAFETY: `cur + NEON_CHUNK_SIZE <= len`, so the 16 bytes
                    // read start inside `input` and end at or before its end.
                    let chunk = unsafe { vld1q_u8(ptr.add(cur)) };
                    let cmp = mask(chunk);
                    // The result is treated as 4 bits per lane: a set nibble
                    // after inversion marks a lane that is not in the class.
                    let miss_bits = !nibble_mask(cmp);
                    if miss_bits != 0 {
                        return cur + (miss_bits.trailing_zeros() / 4) as usize;
                    }
                    cur += NEON_CHUNK_SIZE;
                }
                if cur == len {
                    return len;
                }

                // Tail: re-read the final 16 bytes of the slice. This overlaps
                // bytes that were already accepted, so their lanes are masked
                // out before looking for a mismatch.
                let overlap_start = len - NEON_CHUNK_SIZE;
                // SAFETY: `len >= NEON_CHUNK_SIZE` was checked above, so
                // `overlap_start..len` is exactly the last 16 bytes of `input`.
                let chunk = unsafe { vld1q_u8(ptr.add(overlap_start)) };
                let cmp = mask(chunk);
                let already_scanned = cur - overlap_start;
                let lane_mask = (!0u64) << (already_scanned * 4);
                let miss_bits = !nibble_mask(cmp) & lane_mask;
                if miss_bits != 0 {
                    overlap_start + (miss_bits.trailing_zeros() / 4) as usize
                } else {
                    len
                }
            }

            // ---------------------------------------------------------------
            // x86 / x86_64, 128-bit vectors: 16 bytes per step, unrolled x2.
            // Callers must have verified CPU support before calling.
            // ---------------------------------------------------------------
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[target_feature(enable = "sse4.1")]
            unsafe fn sse_path(input: &[u8]) -> usize {
                use $crate::__macro::{
                    SSE_CHUNK_SIZE as CHUNK,
                    __m128i, _mm_and_si128, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_min_epu8,
                    _mm_movemask_epi8, _mm_or_si128, _mm_set1_epi8, _mm_setzero_si128,
                    _mm_sub_epi8,
                };

                // Per-lane membership mask (0xFF in lanes that are in the class).
                #[target_feature(enable = "sse4.1")]
                unsafe fn mask(chunk: __m128i) -> __m128i {
                    // SAFETY (all blocks in this function): register-only
                    // intrinsics; the required target feature is enabled on
                    // this function and guaranteed by the caller.
                    let mut acc = unsafe { _mm_setzero_si128() };
                    $( $(
                        acc = unsafe {
                            // Cast through `u8` so unsuffixed literals >= 0x80
                            // are reinterpreted instead of overflowing `i8`.
                            _mm_or_si128(
                                acc,
                                _mm_cmpeq_epi8(chunk, _mm_set1_epi8($byte as u8 as i8)),
                            )
                        };
                    )+ )?
                    $( $(
                        acc = unsafe {
                            // Unsigned range test: `b - lo <= hi - lo`, expressed
                            // as `x == min(x, lim)` because SSE has no unsigned
                            // byte compare.
                            let x = _mm_sub_epi8(chunk, _mm_set1_epi8($lo as u8 as i8));
                            let lim = _mm_set1_epi8(($hi as u8).wrapping_sub($lo as u8) as i8);
                            _mm_or_si128(acc, _mm_cmpeq_epi8(x, _mm_min_epu8(x, lim)))
                        };
                    )+ )?
                    let _ = chunk;
                    acc
                }

                let len = input.len();
                if len < CHUNK {
                    return prefix_len_scalar(input);
                }
                let ptr = input.as_ptr();

                // Scalar check of the first chunk keeps early mismatches cheap.
                let first = prefix_len_scalar(&input[..CHUNK]);
                if first != CHUNK {
                    return first;
                }

                let mut cur = CHUNK;

                // Main loop: two vectors per iteration, one combined test.
                while cur + 2 * CHUNK <= len {
                    // SAFETY: `cur + 2 * CHUNK <= len`, so both 16-byte reads
                    // lie inside `input`; `_mm_loadu_si128` has no alignment
                    // requirement.
                    let c0 = unsafe { _mm_loadu_si128(ptr.add(cur) as *const __m128i) };
                    let c1 = unsafe { _mm_loadu_si128(ptr.add(cur + CHUNK) as *const __m128i) };
                    // SAFETY: same target feature as this function.
                    let m0 = unsafe { mask(c0) };
                    let m1 = unsafe { mask(c1) };
                    // SAFETY: register-only intrinsics.
                    let combined = unsafe { _mm_movemask_epi8(_mm_and_si128(m0, m1)) } as u32;
                    if combined != 0xFFFF {
                        // A mismatch exists in one of the halves; locate it.
                        let b0 = unsafe { _mm_movemask_epi8(m0) } as u32;
                        if b0 != 0xFFFF {
                            return cur + ((!b0) & 0xFFFF).trailing_zeros() as usize;
                        }
                        let b1 = unsafe { _mm_movemask_epi8(m1) } as u32;
                        return cur + CHUNK + ((!b1) & 0xFFFF).trailing_zeros() as usize;
                    }
                    cur += 2 * CHUNK;
                }

                // At most one whole vector can remain after the unrolled loop.
                while cur + CHUNK <= len {
                    // SAFETY: `cur + CHUNK <= len`; unaligned load is allowed.
                    let chunk = unsafe { _mm_loadu_si128(ptr.add(cur) as *const __m128i) };
                    // SAFETY: register-only; same target feature as this function.
                    let bits = unsafe { _mm_movemask_epi8(mask(chunk)) } as u32;
                    if bits != 0xFFFF {
                        return cur + ((!bits) & 0xFFFF).trailing_zeros() as usize;
                    }
                    cur += CHUNK;
                }
                if cur == len {
                    return len;
                }

                // Tail: re-read the last 16 bytes and ignore the lanes that
                // overlap the region already accepted.
                let overlap_start = len - CHUNK;
                // SAFETY: `len >= CHUNK`, so `overlap_start..len` is the last
                // 16 bytes of `input`.
                let chunk = unsafe { _mm_loadu_si128(ptr.add(overlap_start) as *const __m128i) };
                // SAFETY: register-only; same target feature as this function.
                let bits = unsafe { _mm_movemask_epi8(mask(chunk)) } as u32;
                let already = cur - overlap_start;
                let scan_mask = (!0u32) << already;
                let non_match = (!bits) & scan_mask & 0xFFFF;
                if non_match != 0 {
                    overlap_start + non_match.trailing_zeros() as usize
                } else {
                    len
                }
            }

            // ---------------------------------------------------------------
            // x86_64 / AVX2: 32 bytes per step, unrolled x2.
            // Callers must have verified AVX2 support before calling.
            // ---------------------------------------------------------------
            #[cfg(target_arch = "x86_64")]
            #[target_feature(enable = "avx2")]
            unsafe fn avx2_path(input: &[u8]) -> usize {
                use $crate::__macro::{
                    AVX2_CHUNK_SIZE as CHUNK,
                    __m256i, _mm256_and_si256, _mm256_cmpeq_epi8, _mm256_loadu_si256,
                    _mm256_min_epu8, _mm256_movemask_epi8, _mm256_or_si256, _mm256_set1_epi8,
                    _mm256_setzero_si256, _mm256_sub_epi8,
                };

                // Per-lane membership mask (0xFF in lanes that are in the class).
                #[target_feature(enable = "avx2")]
                unsafe fn mask256(chunk: __m256i) -> __m256i {
                    // SAFETY (all blocks in this function): register-only
                    // intrinsics; AVX2 is enabled on this function and
                    // guaranteed by the caller.
                    let mut acc = unsafe { _mm256_setzero_si256() };
                    $( $(
                        acc = unsafe {
                            // Cast through `u8` so unsuffixed literals >= 0x80
                            // are reinterpreted instead of overflowing `i8`.
                            _mm256_or_si256(
                                acc,
                                _mm256_cmpeq_epi8(chunk, _mm256_set1_epi8($byte as u8 as i8)),
                            )
                        };
                    )+ )?
                    $( $(
                        acc = unsafe {
                            // Unsigned range test, same trick as the SSE path.
                            let x = _mm256_sub_epi8(chunk, _mm256_set1_epi8($lo as u8 as i8));
                            let lim =
                                _mm256_set1_epi8(($hi as u8).wrapping_sub($lo as u8) as i8);
                            _mm256_or_si256(acc, _mm256_cmpeq_epi8(x, _mm256_min_epu8(x, lim)))
                        };
                    )+ )?
                    let _ = chunk;
                    acc
                }

                let len = input.len();
                if len < CHUNK {
                    return prefix_len_scalar(input);
                }
                let ptr = input.as_ptr();

                // Scalar check of the first chunk keeps early mismatches cheap.
                let first = prefix_len_scalar(&input[..CHUNK]);
                if first != CHUNK {
                    return first;
                }

                let mut cur = CHUNK;

                // Main loop: two vectors per iteration, one combined test.
                while cur + 2 * CHUNK <= len {
                    // SAFETY: `cur + 2 * CHUNK <= len`, so both 32-byte reads
                    // lie inside `input`; `_mm256_loadu_si256` has no alignment
                    // requirement.
                    let c0 = unsafe { _mm256_loadu_si256(ptr.add(cur) as *const __m256i) };
                    let c1 = unsafe { _mm256_loadu_si256(ptr.add(cur + CHUNK) as *const __m256i) };
                    // SAFETY: same target feature as this function.
                    let m0 = unsafe { mask256(c0) };
                    let m1 = unsafe { mask256(c1) };
                    // SAFETY: register-only intrinsics.
                    let combined =
                        unsafe { _mm256_movemask_epi8(_mm256_and_si256(m0, m1)) } as u32;
                    if combined != !0u32 {
                        // A mismatch exists in one of the halves; locate it.
                        let b0 = unsafe { _mm256_movemask_epi8(m0) } as u32;
                        if b0 != !0u32 {
                            return cur + (!b0).trailing_zeros() as usize;
                        }
                        let b1 = unsafe { _mm256_movemask_epi8(m1) } as u32;
                        return cur + CHUNK + (!b1).trailing_zeros() as usize;
                    }
                    cur += 2 * CHUNK;
                }

                // At most one whole vector can remain after the unrolled loop.
                while cur + CHUNK <= len {
                    // SAFETY: `cur + CHUNK <= len`; unaligned load is allowed.
                    let chunk = unsafe { _mm256_loadu_si256(ptr.add(cur) as *const __m256i) };
                    // SAFETY: register-only; same target feature as this function.
                    let bits = unsafe { _mm256_movemask_epi8(mask256(chunk)) } as u32;
                    if bits != !0u32 {
                        return cur + (!bits).trailing_zeros() as usize;
                    }
                    cur += CHUNK;
                }
                if cur == len {
                    return len;
                }

                // Tail: re-read the last 32 bytes and ignore the lanes that
                // overlap the region already accepted.
                let overlap_start = len - CHUNK;
                // SAFETY: `len >= CHUNK`, so `overlap_start..len` is the last
                // 32 bytes of `input`.
                let chunk = unsafe { _mm256_loadu_si256(ptr.add(overlap_start) as *const __m256i) };
                // SAFETY: register-only; same target feature as this function.
                let bits = unsafe { _mm256_movemask_epi8(mask256(chunk)) } as u32;
                let already = cur - overlap_start;
                let scan_mask = (!0u32) << already;
                let non_match = (!bits) & scan_mask;
                if non_match != 0 {
                    overlap_start + non_match.trailing_zeros() as usize
                } else {
                    len
                }
            }

            // ---------------------------------------------------------------
            // wasm32 / simd128: 16 bytes per step, unrolled x2.
            // ---------------------------------------------------------------
            #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
            fn simd128_path(input: &[u8]) -> usize {
                use $crate::__macro::{
                    SSE_CHUNK_SIZE as CHUNK,
                    i8x16_bitmask, i8x16_eq, i8x16_splat, i8x16_sub, u8x16_lt, u8x16_splat,
                    v128, v128_and, v128_load, v128_or,
                };

                // Per-lane membership mask (0xFF in lanes that are in the class).
                fn mask_s128(chunk: v128) -> v128 {
                    let mut acc = u8x16_splat(0);
                    $( $(
                        // Cast through `u8` so unsuffixed literals >= 0x80 are
                        // reinterpreted instead of overflowing `i8`.
                        acc = v128_or(acc, i8x16_eq(chunk, i8x16_splat($byte as u8 as i8)));
                    )+ )?
                    $( $(
                        {
                            let width = ($hi as u8).wrapping_sub($lo as u8);
                            // `width + 1` would wrap to 0 for the full
                            // 0x00..=0xFF range, so that case is all-ones.
                            let in_range = if width == 0xFF {
                                u8x16_splat(0xFF)
                            } else {
                                let x = i8x16_sub(chunk, i8x16_splat($lo as u8 as i8));
                                u8x16_lt(x, u8x16_splat(width.wrapping_add(1)))
                            };
                            acc = v128_or(acc, in_range);
                        }
                    )+ )?
                    let _ = chunk;
                    acc
                }

                let len = input.len();
                if len < CHUNK {
                    return prefix_len_scalar(input);
                }
                let ptr = input.as_ptr();

                // Scalar check of the first chunk keeps early mismatches cheap.
                let first = prefix_len_scalar(&input[..CHUNK]);
                if first != CHUNK {
                    return first;
                }

                let mut cur = CHUNK;

                // Main loop: two vectors per iteration, one combined test.
                while cur + 2 * CHUNK <= len {
                    // SAFETY: `cur + 2 * CHUNK <= len`, so both 16-byte reads
                    // lie inside `input`; `v128_load` has no alignment
                    // requirement.
                    let c0 = unsafe { v128_load(ptr.add(cur) as *const v128) };
                    let c1 = unsafe { v128_load(ptr.add(cur + CHUNK) as *const v128) };
                    let m0 = mask_s128(c0);
                    let m1 = mask_s128(c1);
                    let combined = i8x16_bitmask(v128_and(m0, m1)) as u32;
                    if combined != 0xFFFF {
                        // A mismatch exists in one of the halves; locate it.
                        let b0 = i8x16_bitmask(m0) as u32;
                        if b0 != 0xFFFF {
                            return cur + ((!b0) & 0xFFFF).trailing_zeros() as usize;
                        }
                        let b1 = i8x16_bitmask(m1) as u32;
                        return cur + CHUNK + ((!b1) & 0xFFFF).trailing_zeros() as usize;
                    }
                    cur += 2 * CHUNK;
                }

                // At most one whole vector can remain after the unrolled loop.
                while cur + CHUNK <= len {
                    // SAFETY: `cur + CHUNK <= len`; unaligned load is allowed.
                    let chunk = unsafe { v128_load(ptr.add(cur) as *const v128) };
                    let bits = i8x16_bitmask(mask_s128(chunk)) as u32;
                    if bits != 0xFFFF {
                        return cur + ((!bits) & 0xFFFF).trailing_zeros() as usize;
                    }
                    cur += CHUNK;
                }
                if cur == len {
                    return len;
                }

                // Tail: re-read the last 16 bytes and ignore the lanes that
                // overlap the region already accepted.
                let overlap_start = len - CHUNK;
                // SAFETY: `len >= CHUNK`, so `overlap_start..len` is the last
                // 16 bytes of `input`.
                let chunk = unsafe { v128_load(ptr.add(overlap_start) as *const v128) };
                let bits = i8x16_bitmask(mask_s128(chunk)) as u32;
                let already = cur - overlap_start;
                let scan_mask = (!0u32) << already;
                let non_match = (!bits) & scan_mask & 0xFFFF;
                if non_match != 0 {
                    overlap_start + non_match.trailing_zeros() as usize
                } else {
                    len
                }
            }
        }
    };
}