#[cfg(feature = "runtime_build")]
use ahash::AHashSet;
use std::borrow::Cow;
use crate::process::string_pool::get_string_from_pool;
use crate::process::transform::simd::skip_ascii_non_delete_simd;
use crate::process::transform::utf8::decode_utf8_raw;
/// Length in bytes of a bitset holding one bit for every value in
/// `0..0x110000` (eight values per byte).
#[cfg(feature = "runtime_build")]
const UNICODE_BITSET_SIZE: usize = 0x110000 / 8;
/// Byte iterator over a UTF-8 input that omits every character flagged in the
/// deletion tables.
///
/// Kept characters are yielded one byte at a time: the lead byte is returned
/// immediately and the trailing bytes of a multi-byte character are staged in
/// `buf` and handed out by the following calls to `next`.
pub(crate) struct DeleteByteIter<'a> {
/// Input bytes being filtered.
source: &'a [u8],
/// One bit per code point: byte `cp / 8`, bit `cp % 8`. A set bit means
/// "delete". Code points past the end of the slice are treated as kept.
bitset: &'a [u8],
/// 128-bit table consulted for bytes below `0x80`, indexed the same way as
/// `bitset`.
ascii_lut: &'a [u8; 16],
/// Index in `source` of the next byte that has not been examined yet.
pos: usize,
/// Trailing bytes of the most recently kept multi-byte character.
buf: [u8; 3],
/// Index in `buf` of the next staged byte to yield.
buf_pos: u8,
/// Number of valid staged bytes in `buf`.
buf_len: u8,
}
impl<'a> Iterator for DeleteByteIter<'a> {
    type Item = u8;

    /// Yields the next byte of the input that does not belong to a deleted
    /// character, or `None` once the input is exhausted.
    #[inline(always)]
    fn next(&mut self) -> Option<u8> {
        // Hand out staged trailing bytes of the last kept multi-byte character
        // before looking at any new input.
        if self.buf_pos < self.buf_len {
            let staged = self.buf[self.buf_pos as usize];
            self.buf_pos += 1;
            return Some(staged);
        }

        while self.pos < self.source.len() {
            // SAFETY: the loop condition guarantees `self.pos < self.source.len()`.
            let lead = unsafe { *self.source.get_unchecked(self.pos) };

            if lead < 0x80 {
                self.pos += 1;
                let mask = 1u8 << (lead & 7);
                if (self.ascii_lut[usize::from(lead >> 3)] & mask) == 0 {
                    return Some(lead);
                }
                // Deleted ASCII byte: move on to the next input byte.
                continue;
            }

            // SAFETY: NOTE(review) relies on `source` being valid UTF-8 with
            // `self.pos` on a character boundary; confirm against the contract
            // of `decode_utf8_raw`.
            let (cp, char_len) = unsafe { decode_utf8_raw(self.source, self.pos) };
            let cp = cp as usize;
            let mask = 1u8 << (cp % 8);
            let deleted = self
                .bitset
                .get(cp / 8)
                .map_or(false, |&flags| (flags & mask) != 0);
            if deleted {
                self.pos += char_len;
                continue;
            }

            // Kept multi-byte character: return the lead byte now and stage the
            // trailing bytes for the following calls.
            let tail_len = char_len - 1;
            let tail_start = self.pos + 1;
            let tail_end = tail_start + tail_len;
            // SAFETY: same assumption as above; a decoded character of
            // `char_len` bytes lies entirely inside `source`.
            let tail = unsafe { self.source.get_unchecked(tail_start..tail_end) };
            self.buf[..tail_len].copy_from_slice(tail);
            self.pos = tail_end;
            self.buf_pos = 0;
            self.buf_len = tail_len as u8;
            return Some(lead);
        }

        None
    }
}
/// Lookup tables describing which characters are removed from text.
#[derive(Clone)]
pub(crate) struct DeleteMatcher {
/// One bit per code point: byte `cp / 8`, bit `cp % 8`. A set bit means
/// "delete". Borrowed when built from a static table, owned when built at
/// runtime.
bitset: Cow<'static, [u8]>,
/// Copy of the first 16 bytes of `bitset` (zero-padded if the bitset is
/// shorter), consulted for bytes below `0x80`.
ascii_lut: [u8; 16],
}
impl DeleteMatcher {
    /// Reports whether the ASCII byte `byte` is flagged for deletion.
    ///
    /// `byte` must be below `0x80`; larger values index past the 16-byte table
    /// and panic.
    #[inline(always)]
    fn is_ascii_deleted(&self, byte: u8) -> bool {
        (self.ascii_lut[(byte as usize) >> 3] & (1 << (byte & 7))) != 0
    }

    /// Reports whether code point `cp` is flagged for deletion.
    ///
    /// Code points beyond the end of the bitset are treated as kept.
    #[inline(always)]
    fn is_codepoint_deleted(&self, cp: usize) -> bool {
        cp / 8 < self.bitset.len() && (self.bitset[cp / 8] & (1 << (cp % 8))) != 0
    }

    /// Builds the 16-byte ASCII table from the leading bytes of `bitset`,
    /// zero-padding when the bitset is shorter than 16 bytes.
    fn ascii_lut_from(bitset: &[u8]) -> [u8; 16] {
        let mut lut = [0u8; 16];
        let copy_len = bitset.len().min(lut.len());
        lut[..copy_len].copy_from_slice(&bitset[..copy_len]);
        lut
    }

    /// Removes every flagged character from `text`.
    ///
    /// Returns `None` when `text` contains no flagged character, so the caller
    /// can keep using the input unchanged. Otherwise returns the filtered
    /// string together with a density value: the number of trailing
    /// (non-lead) bytes of kept multi-byte characters divided by the byte
    /// length of the output, or `0.0` when the output is empty.
    pub(crate) fn delete(&self, text: &str) -> Option<(String, f32)> {
        let bytes = text.as_bytes();
        let len = bytes.len();
        let mut offset = 0usize;
        // Trailing bytes of kept multi-byte characters seen so far.
        let mut cont_kept = 0usize;
        // Start of the current run of kept bytes not yet copied to the output.
        let mut gap_start = 0usize;
        // Created lazily on the first deletion so unchanged input costs no
        // output buffer.
        let mut output: Option<String> = None;

        while offset < len {
            // SAFETY: the loop condition guarantees `offset < len`.
            let byte = unsafe { *bytes.get_unchecked(offset) };

            let deleted_len = if byte < 0x80 {
                if self.is_ascii_deleted(byte) {
                    1
                } else {
                    // Kept ASCII byte: step over it, then let the helper
                    // advance past the rest of the run it can skip.
                    offset = skip_ascii_non_delete_simd(bytes, offset + 1, &self.ascii_lut);
                    continue;
                }
            } else {
                // SAFETY: NOTE(review) relies on `text` being valid UTF-8 with
                // `offset` on a character boundary; confirm against the
                // contract of `decode_utf8_raw`.
                let (cp, char_len) = unsafe { decode_utf8_raw(bytes, offset) };
                if self.is_codepoint_deleted(cp as usize) {
                    char_len
                } else {
                    cont_kept += char_len - 1;
                    offset += char_len;
                    continue;
                }
            };

            // A flagged character starts at `offset`: flush the kept run that
            // precedes it and restart the run right after it.
            let out = output.get_or_insert_with(|| get_string_from_pool(len));
            out.push_str(&text[gap_start..offset]);
            offset += deleted_len;
            gap_start = offset;
        }

        // Nothing was deleted: report "unchanged".
        let mut result = output?;
        result.push_str(&text[gap_start..]);

        let output_density = if result.is_empty() {
            0.0
        } else {
            cont_kept as f32 / result.len() as f32
        };
        Some((result, output_density))
    }

    /// Returns an iterator over the bytes of `text` with flagged characters
    /// removed, without building an intermediate string.
    #[inline(always)]
    pub(crate) fn byte_iter<'a>(&'a self, text: &'a str) -> DeleteByteIter<'a> {
        DeleteByteIter {
            source: text.as_bytes(),
            bitset: &self.bitset,
            ascii_lut: &self.ascii_lut,
            pos: 0,
            buf: [0; 3],
            buf_pos: 0,
            buf_len: 0,
        }
    }

    /// Wraps a precomputed static bitset (one bit per code point, byte
    /// `cp / 8`, bit `cp % 8`) without copying it.
    #[cfg(not(feature = "runtime_build"))]
    pub(crate) fn new(bitset: &'static [u8]) -> Self {
        Self {
            ascii_lut: Self::ascii_lut_from(bitset),
            bitset: Cow::Borrowed(bitset),
        }
    }

    /// Builds a matcher from a textual list with one `U+XXXX` entry per line.
    ///
    /// Whitespace around each entry and blank lines are ignored.
    ///
    /// # Panics
    ///
    /// Panics if an entry is not in `U+XXXX` hexadecimal form, or if it names
    /// a value that does not fit in the `UNICODE_BITSET_SIZE`-byte table
    /// (that is, a value above U+10FFFF).
    #[cfg(feature = "runtime_build")]
    pub(crate) fn from_sources(text_delete: &str) -> Self {
        let mut codepoints = AHashSet::new();
        let entries = text_delete
            .lines()
            .map(str::trim)
            .filter(|entry| !entry.is_empty());
        for entry in entries {
            codepoints.insert(parse_delete_codepoint(entry));
        }

        let mut bitset = vec![0u8; UNICODE_BITSET_SIZE];
        for cp in codepoints {
            let cp = cp as usize;
            // Fail with a message naming the offending entry instead of a bare
            // index-out-of-bounds panic.
            assert!(
                cp / 8 < bitset.len(),
                "TEXT-DELETE codepoint U+{:X} is outside the Unicode range",
                cp
            );
            bitset[cp / 8] |= 1 << (cp % 8);
        }

        Self {
            ascii_lut: Self::ascii_lut_from(&bitset),
            bitset: Cow::Owned(bitset),
        }
    }
}
/// Parses one `U+XXXX` entry into its numeric value.
///
/// # Panics
///
/// Panics if `token` does not start with `U+` or if the remainder is not a
/// hexadecimal number that fits in a `u32`.
#[cfg(feature = "runtime_build")]
fn parse_delete_codepoint(token: &str) -> u32 {
    let hex_digits = token
        .strip_prefix("U+")
        .expect("TEXT-DELETE entries must use U+XXXX format");
    let parsed = u32::from_str_radix(hex_digits, 16);
    parsed.expect("TEXT-DELETE entry must contain a valid hexadecimal codepoint")
}
#[cfg(all(test, not(feature = "runtime_build")))]
mod tests {
    use super::*;
    use super::super::constants;

    /// Builds a matcher over the crate's precomputed deletion bitset.
    fn delete_matcher() -> DeleteMatcher {
        DeleteMatcher::new(constants::DELETE_BITSET_BYTES)
    }

    /// Asserts that streaming `text` through `byte_iter` yields exactly the
    /// bytes that `delete` materializes. A `None` from `delete` means the
    /// input is unchanged.
    fn assert_byte_iter_eq_delete(matcher: &DeleteMatcher, text: &str) {
        let materialized: Vec<u8> = match matcher.delete(text) {
            Some((s, _)) => s.into_bytes(),
            None => text.as_bytes().to_vec(),
        };
        let streamed: Vec<u8> = matcher.byte_iter(text).collect();
        assert_eq!(materialized, streamed, "delete mismatch for: {:?}", text);
    }

    #[test]
    fn delete_byte_iter_matches_delete() {
        let m = delete_matcher();
        // Non-ASCII inputs are written as escapes so the fixtures cannot be
        // corrupted by a re-encoding of this file. Together they cover 2-, 3-
        // and 4-byte UTF-8 sequences.
        let cases = [
            "",
            "hello",
            "hello world",
            "a b c",
            "\t\n",
            "caf\u{e9}",
            "\u{4e2d} \u{6587}",
            "a\u{1f600}b",
        ];
        for text in cases {
            assert_byte_iter_eq_delete(&m, text);
        }
    }

    proptest::proptest! {
        #![proptest_config(proptest::prelude::ProptestConfig::with_cases(500))]
        #[test]
        fn prop_delete_byte_iter(text in "\\PC{0,200}") {
            let m = delete_matcher();
            assert_byte_iter_eq_delete(&m, &text);
        }
    }
}