use std::borrow::Cow;
use crate::process::string_pool::get_string_from_pool;
use crate::process::transform::simd::skip_ascii_non_delete_simd;
use crate::process::transform::utf8::decode_utf8_raw;
/// Set of code points to strip from text, stored as a bitset.
///
/// A code point `cp` is treated as "marked for deletion" when bit `cp % 8` of
/// byte `cp / 8` in `bitset` is set; code points whose byte index lies beyond
/// the end of `bitset` are never marked.
#[derive(Clone)]
pub(crate) struct DeleteMatcher {
/// Full code-point bitset. `new` stores it as `Cow::Borrowed`; the `Cow`
/// presumably allows an owned variant to be built elsewhere — confirm
/// against other constructors, if any.
bitset: Cow<'static, [u8]>,
/// Copy of the first (up to) 16 bytes of `bitset`, zero-padded: one bit per
/// ASCII code point (0..128), used for the single-byte fast path.
ascii_lut: [u8; 16],
}
impl DeleteMatcher {
    /// Creates a matcher from a static code-point bitset.
    ///
    /// Bit `cp % 8` of byte `cp / 8` marks code point `cp` for deletion. The
    /// first 16 bytes (fewer if the bitset is shorter) are mirrored into a
    /// zero-padded ASCII lookup table.
    pub(crate) fn new(bitset: &'static [u8]) -> Self {
        let prefix_len = bitset.len().min(16);
        let mut ascii_lut = [0u8; 16];
        ascii_lut[..prefix_len].copy_from_slice(&bitset[..prefix_len]);
        Self {
            bitset: Cow::Borrowed(bitset),
            ascii_lut,
        }
    }

    /// Removes every marked character from `text`.
    ///
    /// Returns `None` when `text` contains no marked character (nothing is
    /// allocated in that case), otherwise `Some` with the surviving characters
    /// in their original order. The output buffer is obtained from
    /// `get_string_from_pool(text.len())`.
    pub(crate) fn delete(&self, text: &str) -> Option<String> {
        let bytes = text.as_bytes();
        let end = bytes.len();

        // Bit test for a single-byte (ASCII) character; callers only pass
        // values below 0x80, so the table index is at most 15.
        let ascii_marked =
            |b: u8| (self.ascii_lut[usize::from(b >> 3)] & (1u8 << (b & 7))) != 0;
        // Bit test for an arbitrary code point; out-of-range means "not marked".
        let wide_marked = |cp: usize| {
            matches!(
                self.bitset.get(cp / 8),
                Some(slot) if (*slot & (1u8 << (cp % 8))) != 0
            )
        };

        // Phase 1: locate the first marked character. Leaves `cursor` on its
        // first byte and yields its encoded width.
        let mut cursor = 0usize;
        let first_width = loop {
            if cursor >= end {
                return None;
            }
            // SAFETY: `cursor < end == bytes.len()` was checked just above.
            let lead = unsafe { *bytes.get_unchecked(cursor) };
            if lead < 0x80 {
                if ascii_marked(lead) {
                    break 1;
                }
                // NOTE(review): presumably returns the index of the next byte
                // that is non-ASCII or marked (or `end`) — confirm in the helper.
                cursor = skip_ascii_non_delete_simd(bytes, cursor + 1, &self.ascii_lut);
            } else {
                // SAFETY: `bytes` comes from a `&str`, `cursor` is in bounds and
                // points at a non-ASCII lead byte. NOTE(review): assumes this is
                // what `decode_utf8_raw` requires — confirm against its contract.
                let (cp, width) = unsafe { decode_utf8_raw(bytes, cursor) };
                if wide_marked(cp as usize) {
                    break width;
                }
                cursor += width;
            }
        };

        // Phase 2: copy the untouched prefix, drop the first hit, then copy the
        // gaps between subsequent hits.
        let mut out = get_string_from_pool(text.len());
        out.push_str(&text[..cursor]);
        cursor += first_width;
        let mut keep_from = cursor;

        while cursor < end {
            // SAFETY: guarded by the loop condition `cursor < end`.
            let lead = unsafe { *bytes.get_unchecked(cursor) };
            let (marked, next) = if lead >= 0x80 {
                // SAFETY: same reasoning as the decode in phase 1.
                let (cp, width) = unsafe { decode_utf8_raw(bytes, cursor) };
                (wide_marked(cp as usize), cursor + width)
            } else if ascii_marked(lead) {
                (true, cursor + 1)
            } else {
                (
                    false,
                    skip_ascii_non_delete_simd(bytes, cursor + 1, &self.ascii_lut),
                )
            };

            if marked {
                // Flush the kept run that ends right before this character.
                out.push_str(&text[keep_from..cursor]);
                keep_from = next;
            }
            cursor = next;
        }

        out.push_str(&text[keep_from..]);
        Some(out)
    }

    /// Returns an iterator over the bytes of `text` with every marked
    /// character's bytes left out.
    #[inline(always)]
    pub(crate) fn filter_bytes<'a>(&'a self, text: &'a str) -> DeleteFilterIterator<'a> {
        let bytes = text.as_bytes();
        DeleteFilterIterator {
            bytes,
            offset: 0,
            char_remaining: 0,
            ascii_lut: &self.ascii_lut,
            bitset: &self.bitset,
        }
    }
}
/// Byte iterator that yields the bytes of a text while skipping every
/// character marked in the originating matcher's tables.
pub(crate) struct DeleteFilterIterator<'a> {
/// Bytes of the text being filtered (set from `str::as_bytes` by
/// `DeleteMatcher::filter_bytes`).
bytes: &'a [u8],
/// Index into `bytes` of the next byte to examine or emit.
offset: usize,
/// Number of trailing bytes of the current kept multi-byte character that
/// still have to be emitted before scanning resumes.
char_remaining: u8,
/// Borrowed ASCII lookup table: bit `b & 7` of entry `b >> 3` marks byte `b`.
ascii_lut: &'a [u8; 16],
/// Borrowed code-point bitset: bit `cp % 8` of byte `cp / 8` marks `cp`.
bitset: &'a [u8],
}
impl Iterator for DeleteFilterIterator<'_> {
    type Item = u8;

    /// Yields the next byte that belongs to a character not marked for
    /// deletion, or `None` once the input is exhausted.
    #[inline(always)]
    fn next(&mut self) -> Option<u8> {
        // Finish emitting a multi-byte character whose lead byte was already
        // returned by an earlier call.
        if self.char_remaining != 0 {
            let at = self.offset;
            self.offset = at + 1;
            self.char_remaining -= 1;
            // SAFETY: `char_remaining` was set from the width reported by
            // `decode_utf8_raw` for a character inside `bytes`.
            // NOTE(review): relies on that width never exceeding the bytes
            // actually available — confirm against the helper's contract.
            return Some(unsafe { *self.bytes.get_unchecked(at) });
        }

        while self.offset < self.bytes.len() {
            let at = self.offset;
            // SAFETY: `at < self.bytes.len()` is the loop condition.
            let lead = unsafe { *self.bytes.get_unchecked(at) };

            if lead < 0x80 {
                // Single-byte character: consumed either way, emitted only
                // when its bit in the ASCII table is clear.
                self.offset = at + 1;
                if (self.ascii_lut[usize::from(lead >> 3)] & (1u8 << (lead & 7))) == 0 {
                    return Some(lead);
                }
                continue;
            }

            // SAFETY: `at` is in bounds and points at a non-ASCII lead byte.
            // NOTE(review): assumes this satisfies `decode_utf8_raw`'s
            // requirements — confirm against its definition.
            let (cp, width) = unsafe { decode_utf8_raw(self.bytes, at) };
            let cp = cp as usize;
            let marked = matches!(
                self.bitset.get(cp / 8),
                Some(slot) if (*slot & (1u8 << (cp % 8))) != 0
            );

            if marked {
                // Drop the whole character and keep scanning.
                self.offset = at + width;
            } else {
                // Emit the lead byte now; the remaining bytes follow on
                // subsequent calls via `char_remaining`.
                self.offset = at + 1;
                self.char_remaining = (width - 1) as u8;
                return Some(lead);
            }
        }

        None
    }
}