#![allow(clippy::wildcard_imports)]
use crate::ir::HirClass;
use crate::parser::ast::{CharClass, CharClassItem, NamedClass};
#[derive(Clone, Copy, Debug)]
pub struct AsciiClassBitmap {
lo: u64,
hi: u64,
negated: bool,
matches_non_ascii: bool,
}
impl AsciiClassBitmap {
#[must_use]
pub fn empty() -> Self {
AsciiClassBitmap {
lo: 0,
hi: 0,
negated: false,
matches_non_ascii: false,
}
}
#[must_use]
pub fn all_ascii() -> Self {
AsciiClassBitmap {
lo: u64::MAX,
hi: u64::MAX,
negated: false,
matches_non_ascii: false,
}
}
#[must_use]
pub fn from_char_class(class: &CharClass) -> Self {
let mut bitmap = AsciiClassBitmap::empty();
bitmap.negated = class.negated;
for item in &class.items {
match item {
CharClassItem::Single(ch) => {
if ch.is_ascii() {
bitmap.set(*ch as u8);
} else {
bitmap.matches_non_ascii = true;
}
}
CharClassItem::Range(start, end) => {
let start_byte = if start.is_ascii() { *start as u8 } else { 128 };
let end_byte = if end.is_ascii() { *end as u8 } else { 127 };
for b in start_byte..=end_byte.min(127) {
bitmap.set(b);
}
if *end as u32 > 127 {
bitmap.matches_non_ascii = true;
}
}
CharClassItem::Named(named) => {
bitmap.add_named_class(*named);
}
}
}
bitmap
}
#[must_use]
pub fn from_hir_class(class: &HirClass) -> Self {
let mut bitmap = AsciiClassBitmap::empty();
bitmap.negated = class.negated;
for &ch in &class.chars {
if ch.is_ascii() {
bitmap.set(ch as u8);
} else {
bitmap.matches_non_ascii = true;
}
}
for &(start, end) in &class.ranges {
let start_byte = if start.is_ascii() { start as u8 } else { 128 };
let end_byte = if end.is_ascii() { end as u8 } else { 127 };
for b in start_byte..=end_byte.min(127) {
bitmap.set(b);
}
if end as u32 > 127 {
bitmap.matches_non_ascii = true;
}
}
for &named in &class.named {
bitmap.add_named_class(named);
}
bitmap
}
fn add_named_class(&mut self, class: NamedClass) {
match class {
NamedClass::Digit => {
for b in b'0'..=b'9' {
self.set(b);
}
}
NamedClass::NotDigit => {
for b in 0u8..=127 {
if !b.is_ascii_digit() {
self.set(b);
}
}
self.matches_non_ascii = true;
}
NamedClass::Word => {
for b in b'a'..=b'z' {
self.set(b);
}
for b in b'A'..=b'Z' {
self.set(b);
}
for b in b'0'..=b'9' {
self.set(b);
}
self.set(b'_');
}
NamedClass::NotWord => {
for b in 0u8..=127 {
let is_word = b.is_ascii_lowercase()
|| b.is_ascii_uppercase()
|| b.is_ascii_digit()
|| b == b'_';
if !is_word {
self.set(b);
}
}
self.matches_non_ascii = true;
}
NamedClass::Whitespace => {
self.set(b' ');
self.set(b'\t');
self.set(b'\n');
self.set(b'\r');
self.set(0x0C); self.set(0x0B); }
NamedClass::NotWhitespace => {
for b in 0u8..=127 {
if !matches!(b, b' ' | b'\t' | b'\n' | b'\r' | 0x0C | 0x0B) {
self.set(b);
}
}
self.matches_non_ascii = true;
}
NamedClass::Any | NamedClass::AnyExceptNewline => {
self.lo = u64::MAX;
self.hi = u64::MAX;
if matches!(class, NamedClass::AnyExceptNewline) {
self.clear(b'\n');
self.clear(b'\r');
}
self.matches_non_ascii = true;
}
}
}
#[inline]
fn set(&mut self, byte: u8) {
if byte < 64 {
self.lo |= 1u64 << byte;
} else if byte < 128 {
self.hi |= 1u64 << (byte - 64);
}
}
#[inline]
fn clear(&mut self, byte: u8) {
if byte < 64 {
self.lo &= !(1u64 << byte);
} else if byte < 128 {
self.hi &= !(1u64 << (byte - 64));
}
}
#[inline]
#[must_use]
pub fn contains(&self, byte: u8) -> bool {
let in_bitmap = if byte < 64 {
(self.lo & (1u64 << byte)) != 0
} else if byte < 128 {
(self.hi & (1u64 << (byte - 64))) != 0
} else {
self.matches_non_ascii
};
if self.negated { !in_bitmap } else { in_bitmap }
}
#[inline]
#[must_use]
pub fn contains_char(&self, ch: char) -> bool {
if ch.is_ascii() {
self.contains(ch as u8)
} else {
let in_class = self.matches_non_ascii;
if self.negated { !in_class } else { in_class }
}
}
#[must_use]
pub fn find_first(&self, haystack: &[u8]) -> Option<usize> {
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
{
if haystack.len() >= 16 {
return self.find_first_simd(haystack);
}
}
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
{
if haystack.len() >= 16 {
return self.find_first_simd(haystack);
}
}
self.find_first_scalar(haystack)
}
#[inline]
fn find_first_scalar(&self, haystack: &[u8]) -> Option<usize> {
for (i, &byte) in haystack.iter().enumerate() {
if self.contains(byte) {
return Some(i);
}
}
None
}
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
fn find_first_simd(&self, haystack: &[u8]) -> Option<usize> {
use std::arch::x86_64::*;
if self.negated || self.matches_non_ascii {
return self.find_first_scalar(haystack);
}
let len = haystack.len();
let mut i = 0;
unsafe {
while i + 16 <= len {
let chunk = _mm_loadu_si128(haystack.as_ptr().add(i).cast::<__m128i>());
let mut mask = 0u16;
let bytes: [u8; 16] = std::mem::transmute(chunk);
for (j, &b) in bytes.iter().enumerate() {
if self.contains(b) {
mask |= 1 << j;
}
}
if mask != 0 {
return Some(i + mask.trailing_zeros() as usize);
}
i += 16;
}
}
(i..len).find(|&j| self.contains(haystack[j]))
}
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
fn find_first_simd(&self, haystack: &[u8]) -> Option<usize> {
if self.negated || self.matches_non_ascii {
return self.find_first_scalar(haystack);
}
let len = haystack.len();
let mut i = 0;
unsafe {
use std::arch::aarch64::*;
while i + 16 <= len {
let chunk = vld1q_u8(haystack.as_ptr().add(i));
let bytes: [u8; 16] = std::mem::transmute(chunk);
for (j, &b) in bytes.iter().enumerate() {
if self.contains(b) {
return Some(i + j);
}
}
i += 16;
}
}
(i..len).find(|&j| self.contains(haystack[j]))
}
#[must_use]
pub fn find_all(&self, haystack: &[u8]) -> Vec<usize> {
let mut results = Vec::new();
let mut pos = 0;
while pos < haystack.len() {
if let Some(offset) = self.find_first(&haystack[pos..]) {
results.push(pos + offset);
pos += offset + 1;
} else {
break;
}
}
results
}
#[must_use]
pub fn count_matches(&self, haystack: &[u8]) -> usize {
haystack.iter().filter(|&&b| self.contains(b)).count()
}
#[inline]
#[must_use]
pub fn matches_any(&self, haystack: &[u8]) -> bool {
self.find_first(haystack).is_some()
}
}
impl Default for AsciiClassBitmap {
fn default() -> Self {
Self::empty()
}
}
#[derive(Clone, Debug)]
pub struct CompiledCharClass {
pub bitmap: AsciiClassBitmap,
pub original: CharClass,
pub unicode: bool,
}
impl CompiledCharClass {
#[must_use]
pub fn new(class: &CharClass) -> Self {
CompiledCharClass {
bitmap: AsciiClassBitmap::from_char_class(class),
original: class.clone(),
unicode: false,
}
}
#[must_use]
pub fn new_with_unicode(class: &CharClass, unicode: bool) -> Self {
CompiledCharClass {
bitmap: AsciiClassBitmap::from_char_class(class),
original: class.clone(),
unicode,
}
}
#[inline]
#[must_use]
pub fn matches(&self, ch: char) -> bool {
if ch.is_ascii() {
self.bitmap.contains(ch as u8)
} else if self.unicode {
self.original.matches_unicode(ch)
} else {
self.original.matches(ch)
}
}
#[inline]
#[must_use]
pub fn find_first(&self, haystack: &[u8]) -> Option<usize> {
self.bitmap.find_first(haystack)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_ascii_bitmap_single() {
let class = CharClass::new(false, vec![CharClassItem::Single('a')]);
let bitmap = AsciiClassBitmap::from_char_class(&class);
assert!(bitmap.contains(b'a'));
assert!(!bitmap.contains(b'b'));
assert!(!bitmap.contains(b'A'));
}
#[test]
fn test_ascii_bitmap_range() {
let class = CharClass::new(false, vec![CharClassItem::Range('a', 'z')]);
let bitmap = AsciiClassBitmap::from_char_class(&class);
assert!(bitmap.contains(b'a'));
assert!(bitmap.contains(b'm'));
assert!(bitmap.contains(b'z'));
assert!(!bitmap.contains(b'A'));
assert!(!bitmap.contains(b'0'));
}
#[test]
fn test_ascii_bitmap_negated() {
let class = CharClass::new(true, vec![CharClassItem::Range('a', 'z')]);
let bitmap = AsciiClassBitmap::from_char_class(&class);
assert!(!bitmap.contains(b'a'));
assert!(!bitmap.contains(b'z'));
assert!(bitmap.contains(b'A'));
assert!(bitmap.contains(b'0'));
assert!(bitmap.contains(b' '));
}
#[test]
fn test_ascii_bitmap_digit() {
let class = CharClass::digit();
let bitmap = AsciiClassBitmap::from_char_class(&class);
for b in b'0'..=b'9' {
assert!(bitmap.contains(b), "Should contain digit {}", b as char);
}
assert!(!bitmap.contains(b'a'));
assert!(!bitmap.contains(b' '));
}
#[test]
fn test_ascii_bitmap_word() {
let class = CharClass::word();
let bitmap = AsciiClassBitmap::from_char_class(&class);
assert!(bitmap.contains(b'a'));
assert!(bitmap.contains(b'Z'));
assert!(bitmap.contains(b'5'));
assert!(bitmap.contains(b'_'));
assert!(!bitmap.contains(b' '));
assert!(!bitmap.contains(b'-'));
}
#[test]
fn test_find_first() {
let class = CharClass::new(false, vec![CharClassItem::Range('a', 'z')]);
let bitmap = AsciiClassBitmap::from_char_class(&class);
assert_eq!(bitmap.find_first(b"123abc"), Some(3));
assert_eq!(bitmap.find_first(b"ABC"), None);
assert_eq!(bitmap.find_first(b"hello"), Some(0));
assert_eq!(bitmap.find_first(b""), None);
}
#[test]
fn test_find_first_long() {
let class = CharClass::new(false, vec![CharClassItem::Single('x')]);
let bitmap = AsciiClassBitmap::from_char_class(&class);
let text = b"0123456789abcdefxyz";
assert_eq!(bitmap.find_first(text), Some(16));
let text2 = b"01234567890123456789x";
assert_eq!(bitmap.find_first(text2), Some(20));
}
#[test]
fn test_find_all() {
let class = CharClass::new(false, vec![CharClassItem::Range('a', 'z')]);
let bitmap = AsciiClassBitmap::from_char_class(&class);
let positions = bitmap.find_all(b"a1b2c3");
assert_eq!(positions, vec![0, 2, 4]);
}
#[test]
fn test_count_matches() {
let class = CharClass::digit();
let bitmap = AsciiClassBitmap::from_char_class(&class);
assert_eq!(bitmap.count_matches(b"abc123def456"), 6);
assert_eq!(bitmap.count_matches(b"no digits"), 0);
}
#[test]
fn test_compiled_char_class() {
let class = CharClass::word();
let compiled = CompiledCharClass::new(&class);
assert!(compiled.matches('a'));
assert!(compiled.matches('Z'));
assert!(compiled.matches('5'));
assert!(!compiled.matches(' '));
}
}