use super::{decode_page_table, decode_utf8_raw, page_table_lookup, replace_scan, skip_ascii_simd};
/// Byte-level iterator that streams a text's UTF-8 bytes while substituting
/// every code point for which the page table yields a different code point.
///
/// Created by `VariantNormMatcher::filter_bytes`.
pub(crate) struct VariantNormFilterIterator<'a> {
/// Input bytes being scanned (`filter_bytes` passes `text.as_bytes()`).
bytes: &'a [u8],
/// Index into `bytes` of the next unread input byte.
offset: usize,
/// Number of trailing bytes of the current unmapped multi-byte character
/// that still have to be copied through verbatim.
char_remaining: u8,
/// UTF-8 encoding of the most recent replacement character.
replace_buf: [u8; 4],
/// Number of valid bytes in `replace_buf`.
replace_len: u8,
/// Index of the next byte of `replace_buf` to emit; the buffer is drained
/// once this reaches `replace_len`.
replace_pos: u8,
/// First table handed to `page_table_lookup`.
/// NOTE(review): presumably the level-1 page index — confirm against the helper.
l1: &'a [u16],
/// Second table handed to `page_table_lookup`.
/// NOTE(review): presumably the level-2 mapped code points — confirm against the helper.
l2: &'a [u32],
}
impl Iterator for VariantNormFilterIterator<'_> {
    type Item = u8;

    /// Produces the next output byte, or `None` once the input is exhausted.
    ///
    /// Each call takes the first applicable step:
    /// 1. emit a pending byte of a previously encoded replacement character;
    /// 2. emit a pending trailing byte of an unmapped multi-byte character;
    /// 3. stop at end of input;
    /// 4. pass an ASCII byte (`< 0x80`) through unchanged;
    /// 5. decode the next code point; if the page table maps it to a
    ///    *different* code point, encode that replacement and emit its first
    ///    byte, otherwise emit the original lead byte and schedule the rest.
    #[inline(always)]
    fn next(&mut self) -> Option<u8> {
        // Drain bytes of a replacement character buffered by an earlier call.
        if self.replace_pos < self.replace_len {
            let byte = self.replace_buf[self.replace_pos as usize];
            self.replace_pos += 1;
            return Some(byte);
        }

        // Copy through the remaining bytes of an unmapped multi-byte character.
        if self.char_remaining > 0 {
            // SAFETY: `char_remaining` was derived from the length reported by
            // `decode_utf8_raw` for the character starting just before
            // `offset`. This is in bounds provided that length never runs past
            // the end of `bytes`, which holds for the valid UTF-8 supplied by
            // `filter_bytes`.
            let byte = unsafe { *self.bytes.get_unchecked(self.offset) };
            self.offset += 1;
            self.char_remaining -= 1;
            return Some(byte);
        }

        if self.offset >= self.bytes.len() {
            return None;
        }

        // SAFETY: `offset < bytes.len()` was checked immediately above.
        let byte = unsafe { *self.bytes.get_unchecked(self.offset) };
        if byte < 0x80 {
            // ASCII is never looked up in the page table.
            self.offset += 1;
            return Some(byte);
        }

        // SAFETY: `offset` is in bounds and sits on the lead byte of a
        // multi-byte sequence, given that `bytes` is valid UTF-8.
        // NOTE(review): confirm this matches `decode_utf8_raw`'s safety contract.
        let (cp, char_len) = unsafe { decode_utf8_raw(self.bytes, self.offset) };
        if let Some(mapped_cp) = page_table_lookup(cp, self.l1, self.l2)
            && mapped_cp != cp
        {
            self.offset += char_len;
            // Same debug-only validity check as `VariantNormFindIter::next`.
            debug_assert!(char::from_u32(mapped_cp).is_some());
            // SAFETY: relies on the page table holding only valid Unicode
            // scalar values; verified by the assertion above in debug builds.
            let mapped = unsafe { char::from_u32_unchecked(mapped_cp) };
            let len = mapped.encode_utf8(&mut self.replace_buf).len();
            self.replace_len = len as u8; // a char encodes to at most 4 bytes
            // Byte 0 is returned right now; later calls resume at index 1.
            self.replace_pos = 1;
            return Some(self.replace_buf[0]);
        }

        // Unmapped (or identity-mapped) character: emit the lead byte now and
        // the remaining `char_len - 1` bytes on subsequent calls.
        self.offset += 1;
        self.char_remaining = (char_len - 1) as u8;
        Some(byte)
    }
}
/// Iterator over the characters of a text for which the page table yields a
/// different code point.
///
/// Created by `VariantNormMatcher::iter`.
struct VariantNormFindIter<'a> {
/// First table handed to `page_table_lookup`.
/// NOTE(review): presumably the level-1 page index — confirm against the helper.
l1: &'a [u16],
/// Second table handed to `page_table_lookup`.
/// NOTE(review): presumably the level-2 mapped code points — confirm against the helper.
l2: &'a [u32],
/// Text being scanned.
text: &'a str,
/// Byte index into `text` at which the next call to `next` resumes scanning.
byte_offset: usize,
}
impl<'a> Iterator for VariantNormFindIter<'a> {
    /// `(start, end, replacement)`: the byte range `start..end` of a character
    /// in the scanned text and the character the page table maps it to.
    type Item = (usize, usize, char);

    /// Advances to the next character whose page-table entry differs from the
    /// character itself, returning its byte range and replacement, or `None`
    /// when the rest of the text contains no such character.
    #[inline(always)]
    fn next(&mut self) -> Option<Self::Item> {
        let haystack = self.text.as_bytes();
        loop {
            // Jump over the ASCII run; ASCII is never looked up.
            let start = skip_ascii_simd(haystack, self.byte_offset);
            self.byte_offset = start;
            if start >= haystack.len() {
                return None;
            }

            let (cp, width) = unsafe { decode_utf8_raw(haystack, start) };
            let end = start + width;
            // Always step past the decoded character, match or not.
            self.byte_offset = end;

            // Skip characters with no entry or an identity entry.
            let Some(mapped_cp) = page_table_lookup(cp, self.l1, self.l2).filter(|&m| m != cp)
            else {
                continue;
            };

            debug_assert!(char::from_u32(mapped_cp).is_some());
            let mapped = unsafe { char::from_u32_unchecked(mapped_cp) };
            return Some((start, end, mapped));
        }
    }
}
/// Owns the two page-table arrays produced by `decode_page_table` and hands
/// out iterators that apply the table to a text.
#[derive(Clone)]
pub(crate) struct VariantNormMatcher {
/// First table passed to `page_table_lookup`.
/// NOTE(review): presumably the level-1 page index — confirm against the helper.
l1: Box<[u16]>,
/// Second table passed to `page_table_lookup`.
/// NOTE(review): presumably the level-2 mapped code points — confirm against the helper.
l2: Box<[u32]>,
}
impl VariantNormMatcher {
    /// Returns an iterator over every character of `text` that the page table
    /// maps to a different character, starting from byte offset 0.
    #[inline(always)]
    fn iter<'a>(&'a self, text: &'a str) -> VariantNormFindIter<'a> {
        VariantNormFindIter {
            text,
            byte_offset: 0,
            l1: &*self.l1,
            l2: &*self.l2,
        }
    }

    /// Feeds the matches found in `text` to `replace_scan` and returns its
    /// result.
    ///
    /// NOTE(review): presumably `None` means nothing in `text` needed
    /// replacing — confirm against `replace_scan`.
    pub(crate) fn replace(&self, text: &str) -> Option<String> {
        let matches = self.iter(text);
        replace_scan(text, matches)
    }

    /// Returns a byte iterator over `text` in which mapped characters are
    /// substituted on the fly; all iterator state starts out empty.
    #[inline(always)]
    pub(crate) fn filter_bytes<'a>(&'a self, text: &'a str) -> VariantNormFilterIterator<'a> {
        let bytes = text.as_bytes();
        VariantNormFilterIterator {
            l1: &*self.l1,
            l2: &*self.l2,
            bytes,
            offset: 0,
            char_remaining: 0,
            replace_pos: 0,
            replace_len: 0,
            replace_buf: [0; 4],
        }
    }

    /// Builds a matcher from the two raw table blobs by decoding them with
    /// `decode_page_table`.
    pub(crate) fn new(l1: &'static [u8], l2: &'static [u8]) -> Self {
        let (level1, level2) = decode_page_table(l1, l2);
        Self {
            l1: level1,
            l2: level2,
        }
    }
}