use std::borrow::Cow;
use super::{decode_page_table, decode_utf8_raw, page_table_lookup, replace_spans, unpack_str_ref};
/// Cursor over `text` that reports the spans whose code point has a
/// replacement in the page table.
///
/// The iterator borrows everything it needs; it owns only its position.
struct NormalizeFindIter<'a> {
    /// Text being scanned.
    text: &'a str,
    /// Byte index into `text` at which the next call to `next` resumes.
    byte_offset: usize,
    /// First table handed to `page_table_lookup`.
    l1: &'a [u16],
    /// Second table handed to `page_table_lookup`.
    l2: &'a [u32],
    /// String storage that `unpack_str_ref` resolves table values against.
    strings: &'a str,
}
impl<'a> Iterator for NormalizeFindIter<'a> {
    /// `(start, end, replacement)`: the byte range `start..end` of the scanned
    /// text covers one character, and `replacement` is the string the table
    /// maps that character to.
    type Item = (usize, usize, &'a str);

    /// Advances to the next character that has a table entry and returns its
    /// span together with the replacement; returns `None` once the text is
    /// exhausted.
    #[inline(always)]
    fn next(&mut self) -> Option<Self::Item> {
        // Copy the shared references out so the lookups below never borrow `self`.
        let (l1, l2, strings) = (self.l1, self.l2, self.strings);
        let bytes = self.text.as_bytes();

        while let Some(&lead) = bytes.get(self.byte_offset) {
            let start = self.byte_offset;

            let table_hit = if lead.is_ascii() {
                self.byte_offset = start + 1;
                // Only uppercase ASCII letters are ever looked up; every other
                // ASCII byte is skipped without touching the table.
                if !lead.is_ascii_uppercase() {
                    continue;
                }
                page_table_lookup(u32::from(lead), l1, l2)
            } else {
                // SAFETY: `bytes` is the byte view of a `&str`, and the cursor
                // only ever moves forward by one whole character, so `start`
                // is in bounds and sits on the lead byte of a multi-byte
                // sequence. NOTE(review): presumably this is exactly what
                // `decode_utf8_raw` requires - confirm against its contract.
                let (cp, width) = unsafe { decode_utf8_raw(bytes, start) };
                self.byte_offset = start + width;
                page_table_lookup(cp, l1, l2)
            };

            // A table value that does not resolve to a string is treated the
            // same as "no entry": keep scanning.
            if let Some(replacement) = table_hit.and_then(|value| unpack_str_ref(value, strings)) {
                return Some((start, self.byte_offset, replacement));
            }
        }

        None
    }
}
/// Byte-at-a-time view of a text with table replacements applied on the fly.
///
/// Bytes that belong to an already-chosen output sequence are parked in
/// `remaining` and handed out before any further input is consumed.
pub(crate) struct NormalizeFilterIterator<'a> {
    /// Input bytes being filtered.
    bytes: &'a [u8],
    /// Index into `bytes` of the next unread input byte.
    offset: usize,
    /// Output bytes that are already decided but not yet yielded.
    remaining: &'a [u8],
    /// First table handed to `page_table_lookup`.
    l1: &'a [u16],
    /// Second table handed to `page_table_lookup`.
    l2: &'a [u32],
    /// String storage that `unpack_str_ref` resolves table values against.
    strings: &'a str,
}
impl Iterator for NormalizeFilterIterator<'_> {
    type Item = u8;

    /// Yields the next output byte.
    ///
    /// Pending bytes in `remaining` are drained first. Otherwise one input
    /// character is consumed: if the table maps it to a replacement string,
    /// the replacement's bytes are emitted in its place; if not, the
    /// character's own bytes are passed through unchanged. Returns `None`
    /// once the input is exhausted and nothing is pending.
    ///
    /// A replacement that resolves to an empty string contributes no bytes:
    /// the character is dropped and scanning continues with the next one
    /// (previously this case panicked on an out-of-range index).
    #[inline(always)]
    fn next(&mut self) -> Option<u8> {
        // Copy the shared references out so the lookups below never borrow `self`.
        let (l1, l2, strings) = (self.l1, self.l2, self.strings);

        loop {
            // Finish whatever multi-byte output is already in flight.
            if let Some((&pending, rest)) = self.remaining.split_first() {
                self.remaining = rest;
                return Some(pending);
            }

            let start = self.offset;
            // Checked read: `None` here means the input is exhausted.
            let byte = *self.bytes.get(start)?;

            let replacement = if byte < 0x80 {
                self.offset = start + 1;
                // Only uppercase ASCII letters are ever looked up; every other
                // ASCII byte passes straight through.
                if !byte.is_ascii_uppercase() {
                    return Some(byte);
                }
                match page_table_lookup(u32::from(byte), l1, l2)
                    .and_then(|value| unpack_str_ref(value, strings))
                {
                    Some(s) => s,
                    None => return Some(byte),
                }
            } else {
                // SAFETY: within this file the iterator is only built from
                // `str::as_bytes`, and `offset` only ever advances by one
                // whole character, so `start` is in bounds and sits on the
                // lead byte of a multi-byte sequence. NOTE(review): presumably
                // this is exactly what `decode_utf8_raw` requires - confirm
                // against its contract.
                let (cp, char_len) = unsafe { decode_utf8_raw(self.bytes, start) };
                self.offset = start + char_len;
                match page_table_lookup(cp, l1, l2)
                    .and_then(|value| unpack_str_ref(value, strings))
                {
                    Some(s) => s,
                    None => {
                        // No entry: emit the lead byte now and queue the rest
                        // of this character's bytes verbatim.
                        self.remaining = &self.bytes[start + 1..self.offset];
                        return Some(byte);
                    }
                }
            };

            if let Some((&first, rest)) = replacement.as_bytes().split_first() {
                self.remaining = rest;
                return Some(first);
            }
            // Empty replacement: nothing to emit for this character, so loop
            // around and consume the next one.
        }
    }
}
/// Owns the lookup tables and replacement-string storage that the
/// normalization iterators in this file borrow from.
#[derive(Clone)]
pub(crate) struct NormalizeMatcher {
    /// Replacement-string storage; borrowed from static data by `new`.
    strings: Cow<'static, str>,
    /// First table produced by `decode_page_table`.
    l1: Box<[u16]>,
    /// Second table produced by `decode_page_table`.
    l2: Box<[u32]>,
}
impl NormalizeMatcher {
    /// Builds a matcher from static table data.
    ///
    /// `l1` and `l2` are passed to `decode_page_table`, whose two results
    /// become the matcher's lookup tables; `strings` is kept as a borrowed
    /// `Cow`, so no copy of the string storage is made here.
    pub(crate) fn new(l1: &'static [u8], l2: &'static [u8], strings: &'static str) -> Self {
        let (level1, level2) = decode_page_table(l1, l2);
        Self {
            strings: Cow::Borrowed(strings),
            l1: level1,
            l2: level2,
        }
    }

    /// Applies the table to `text` by feeding every matching span to
    /// `replace_spans` and returning its result.
    ///
    /// NOTE(review): presumably `None` means nothing needed replacing -
    /// confirm against `replace_spans`.
    pub(crate) fn replace(&self, text: &str) -> Option<String> {
        let spans = self.iter(text);
        replace_spans(text, spans)
    }

    /// Returns an iterator over the bytes of `text` with table replacements
    /// substituted as they are encountered, starting at the first byte with
    /// nothing pending.
    #[inline(always)]
    pub(crate) fn filter_bytes<'a>(&'a self, text: &'a str) -> NormalizeFilterIterator<'a> {
        NormalizeFilterIterator {
            strings: &self.strings,
            l1: &*self.l1,
            l2: &*self.l2,
            bytes: text.as_bytes(),
            remaining: &[],
            offset: 0,
        }
    }

    /// Returns a span iterator positioned at the start of `text`, borrowing
    /// this matcher's tables and string storage.
    #[inline(always)]
    fn iter<'a>(&'a self, text: &'a str) -> NormalizeFindIter<'a> {
        NormalizeFindIter {
            text,
            byte_offset: 0,
            strings: &self.strings,
            l1: &*self.l1,
            l2: &*self.l2,
        }
    }
}